Nina Odoux

NOTEBOOK 2¶

Este cuaderno se enfocara en data splitting y EDA analisis del data set¶

  1. VERIFICAR BALANCE
  2. SPLIT TRAIN/TEST
  3. CATEGORICAL AND CONTINUOUS
  4. UNIVARIABLE ANALISIS
  5. CONCLUSION UNIVARIABLE

  • For each type CATEGORICAL AND CONTINUOUS analysis of:
  1. OUTLIERS
  2. MISSING VALUES
  3. CORRELATION
In [30]:
#importacion de librerias :
import os
import pandas as pd 
import numpy as np   
import matplotlib.pyplot as plt 
import seaborn as sns      
from sklearn.model_selection import train_test_split
import time
import gc
import psutil
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer


#importar funciones de mi archivo ipynb:
import sys
sys.path.append("../src")  
import eda_utils as eda

seed = 123
pd.set_option('display.max_columns', 123)
pd.set_option('display.max_rows', 5000)

Lectura de datos del preprocesado inicial¶

In [33]:
data = pd.read_csv("../data/preprocessed_data/01_preprocessed_data.csv")
data
Out[33]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100002 1 CASH LOANS MALE NO YES 0 202500.0 406597.5 24700.5 351000.0 UNACCOMPANIED WORKING SECONDARY / SECONDARY SPECIAL SINGLE / NOT MARRIED HOUSE / APARTMENT 0.018801 -9461 -637 -3648.0 -2120 NaN 1 1 0 1 1 0 LABORERS 1.0 2 2 WEDNESDAY 10 0 0 0 0 0 0 BUSINESS ENTITY TYPE 3 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 0.9722 0.6341 0.0144 0.0000 0.0690 0.0833 0.1250 0.0377 0.0220 0.0198 0.0 0.0000 0.0250 0.0369 0.9722 0.6243 0.0144 0.00 0.0690 0.0833 0.1250 0.0375 0.0205 0.0193 0.0000 0.0000 REG OPER ACCOUNT BLOCK OF FLATS 0.0149 STONE, BRICK NO -1134.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
1 100003 0 CASH LOANS FEMALE NO NO 0 270000.0 1293502.5 35698.5 1129500.0 FAMILY STATE SERVANT HIGHER EDUCATION MARRIED HOUSE / APARTMENT 0.003541 -16765 -1188 -1186.0 -291 NaN 1 1 0 1 1 0 CORE STAFF 2.0 1 1 MONDAY 11 0 0 0 0 0 0 SCHOOL 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 0.9851 0.8040 0.0497 0.0806 0.0345 0.2917 0.3333 0.0128 0.0790 0.0554 0.0 0.0000 0.0968 0.0529 0.9851 0.7987 0.0608 0.08 0.0345 0.2917 0.3333 0.0132 0.0787 0.0558 0.0039 0.0100 REG OPER ACCOUNT BLOCK OF FLATS 0.0714 BLOCK NO -828.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
2 100004 0 REVOLVING LOANS MALE YES YES 0 67500.0 135000.0 6750.0 135000.0 UNACCOMPANIED WORKING SECONDARY / SECONDARY SPECIAL SINGLE / NOT MARRIED HOUSE / APARTMENT 0.010032 -19046 -225 -4260.0 -2531 26.0 1 1 1 1 1 0 LABORERS 1.0 2 2 MONDAY 9 0 0 0 0 0 0 GOVERNMENT NaN 0.555912 0.729567 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN -815.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3 100006 0 CASH LOANS FEMALE NO YES 0 135000.0 312682.5 29686.5 297000.0 UNACCOMPANIED WORKING SECONDARY / SECONDARY SPECIAL CIVIL MARRIAGE HOUSE / APARTMENT 0.008019 -19005 -3039 -9833.0 -2437 NaN 1 1 0 1 0 0 LABORERS 2.0 2 2 WEDNESDAY 17 0 0 0 0 0 0 BUSINESS ENTITY TYPE 3 NaN 0.650442 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN -617.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN
4 100007 0 CASH LOANS MALE NO YES 0 121500.0 513000.0 21865.5 513000.0 UNACCOMPANIED WORKING SECONDARY / SECONDARY SPECIAL SINGLE / NOT MARRIED HOUSE / APARTMENT 0.028663 -19932 -3038 -4311.0 -3458 NaN 1 1 0 1 0 0 CORE STAFF 1.0 2 2 THURSDAY 11 0 0 0 0 1 1 RELIGION NaN 0.322738 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN -1106.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
307506 456251 0 CASH LOANS MALE NO NO 0 157500.0 254700.0 27558.0 225000.0 UNACCOMPANIED WORKING SECONDARY / SECONDARY SPECIAL SEPARATED WITH PARENTS 0.032561 -9327 -236 -8456.0 -1982 NaN 1 1 0 1 0 0 SALES STAFF 1.0 1 1 THURSDAY 15 0 0 0 0 0 0 SERVICES 0.145570 0.681632 NaN 0.2021 0.0887 0.9876 0.8300 0.0202 0.22 0.1034 0.6042 0.2708 0.0594 0.1484 0.1965 0.0753 0.1095 0.1008 0.0172 0.9782 0.7125 0.0172 0.0806 0.0345 0.4583 0.0417 0.0094 0.0882 0.0853 0.0 0.0125 0.2040 0.0887 0.9876 0.8323 0.0203 0.22 0.1034 0.6042 0.2708 0.0605 0.1509 0.2001 0.0757 0.1118 REG OPER ACCOUNT BLOCK OF FLATS 0.2898 STONE, BRICK NO -273.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN
307507 456252 0 CASH LOANS FEMALE NO YES 0 72000.0 269550.0 12001.5 225000.0 UNACCOMPANIED PENSIONER SECONDARY / SECONDARY SPECIAL WIDOW HOUSE / APARTMENT 0.025164 -20775 365243 -4388.0 -4090 NaN 1 0 0 1 1 0 NaN 1.0 2 2 MONDAY 8 0 0 0 0 0 0 XNA NaN 0.115992 NaN 0.0247 0.0435 0.9727 0.6260 0.0022 0.00 0.1034 0.0833 0.1250 0.0579 0.0202 0.0257 0.0000 0.0000 0.0252 0.0451 0.9727 0.6406 0.0022 0.0000 0.1034 0.0833 0.1250 0.0592 0.0220 0.0267 0.0 0.0000 0.0250 0.0435 0.9727 0.6310 0.0022 0.00 0.1034 0.0833 0.1250 0.0589 0.0205 0.0261 0.0000 0.0000 REG OPER ACCOUNT BLOCK OF FLATS 0.0214 STONE, BRICK NO 0.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN
307508 456253 0 CASH LOANS FEMALE NO YES 0 153000.0 677664.0 29979.0 585000.0 UNACCOMPANIED WORKING HIGHER EDUCATION SEPARATED HOUSE / APARTMENT 0.005002 -14966 -7921 -6737.0 -5150 NaN 1 1 0 1 0 1 MANAGERS 1.0 3 3 THURSDAY 9 0 0 0 0 1 1 SCHOOL 0.744026 0.535722 0.218859 0.1031 0.0862 0.9816 0.7484 0.0123 0.00 0.2069 0.1667 0.2083 NaN 0.0841 0.9279 0.0000 0.0000 0.1050 0.0894 0.9816 0.7583 0.0124 0.0000 0.2069 0.1667 0.2083 NaN 0.0918 0.9667 0.0 0.0000 0.1041 0.0862 0.9816 0.7518 0.0124 0.00 0.2069 0.1667 0.2083 NaN 0.0855 0.9445 0.0000 0.0000 REG OPER ACCOUNT BLOCK OF FLATS 0.7970 PANEL NO -1909.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1.0 0.0 0.0 1.0 0.0 1.0
307509 456254 1 CASH LOANS FEMALE NO YES 0 171000.0 370107.0 20205.0 319500.0 UNACCOMPANIED COMMERCIAL ASSOCIATE SECONDARY / SECONDARY SPECIAL MARRIED HOUSE / APARTMENT 0.005313 -11961 -4786 -2562.0 -931 NaN 1 1 0 1 0 0 LABORERS 2.0 2 2 WEDNESDAY 9 0 0 0 1 1 0 BUSINESS ENTITY TYPE 1 NaN 0.514163 0.661024 0.0124 NaN 0.9771 NaN NaN NaN 0.0690 0.0417 NaN NaN NaN 0.0061 NaN NaN 0.0126 NaN 0.9772 NaN NaN NaN 0.0690 0.0417 NaN NaN NaN 0.0063 NaN NaN 0.0125 NaN 0.9771 NaN NaN NaN 0.0690 0.0417 NaN NaN NaN 0.0062 NaN NaN NaN BLOCK OF FLATS 0.0086 STONE, BRICK NO -322.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
307510 456255 0 CASH LOANS FEMALE NO NO 0 157500.0 675000.0 49117.5 675000.0 UNACCOMPANIED COMMERCIAL ASSOCIATE HIGHER EDUCATION MARRIED HOUSE / APARTMENT 0.046220 -16856 -1262 -5128.0 -410 NaN 1 1 1 1 1 0 LABORERS 2.0 1 1 THURSDAY 20 0 0 0 0 1 1 BUSINESS ENTITY TYPE 3 0.734460 0.708569 0.113922 0.0742 0.0526 0.9881 NaN 0.0176 0.08 0.0690 0.3750 NaN NaN NaN 0.0791 NaN 0.0000 0.0756 0.0546 0.9881 NaN 0.0178 0.0806 0.0690 0.3750 NaN NaN NaN 0.0824 NaN 0.0000 0.0749 0.0526 0.9881 NaN 0.0177 0.08 0.0690 0.3750 NaN NaN NaN 0.0805 NaN 0.0000 NaN BLOCK OF FLATS 0.0718 PANEL NO -787.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 2.0 0.0 1.0

307511 rows × 118 columns

VERIFICAR BALANCE¶

Los datos no parecen seguir un orden específico que impacte la repartición durante el proceso de división. Sin embargo, se observa un desequilibrio en las proporciones de las clases de TARGET que debe ser tratado. Para abordar el desequilibrio de clases, podríamos considerar métodos como el re-sampling, el uso de pesos de clase o realizar un muestreo estratificado para asegurar una representación equilibrada de cada clase durante el entrenamiento del modelo.

STRATIFICATION¶

Asegura que la distribución de clases de la variable objetivo se mantenga tanto en el TRAIN como en el TEST. Esto es importante cuando se trabajan con clases desequilibradas, ya que previene que el conjunto de TEST tenga una distribución desigual de las clases en comparación con el conjunto de datos original.

SEPARACION TRAIN Y TEST¶

porque permite evaluar cómo de bien el modelo generalizara a datos no vistos previamente y se adapta a nuevos datos desconocidos

In [37]:
#features seran (X), que son independientes y target variable sera (y): dependiente en teoria de las variables de X :

#removar nuestra variable objetivo del conjunto X porque es la que es dependiente:
X = data.drop('TARGET', axis=1)  

#asignar y como variable objetivo TARGET:
y = data['TARGET']

#stratified train-test split (80% train, 20% test), enfocado en una stratificacion de la variable TARGET:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed
)


data_test = pd.concat([X_test, y_test],axis=1)
data_train = pd.concat([X_train, y_train],axis=1)

# verificacion de la distribucion de las clases en los diferentes conjuntos:
print("Original class distribution:")
print(y.value_counts(normalize=True))  # Proportion in original data
print("\nTrain set class distribution:")
print(y_train.value_counts(normalize=True))  # Proportion in train set
print("\nTest set class distribution:")
print(y_test.value_counts(normalize=True))  # Proportion in test set
Original class distribution:
0    0.919271
1    0.080729
Name: TARGET, dtype: float64

Train set class distribution:
0    0.919271
1    0.080729
Name: TARGET, dtype: float64

Test set class distribution:
0    0.919272
1    0.080728
Name: TARGET, dtype: float64

Stratification permitio que las distribuciones de las clases conserven el mismo patron como lo del dataset de origen¶

  • Eran alrededor del 92% para class 0 y un 8% para class 1

Visualización descriptiva de los datos¶

  • valores nulos por filas
  • por columnas

Por columnas (variable)¶

In [42]:
#cqlcular el % de missing por columna
missing_col = eda.check_missing_values(data_train)

# crear un DataFrame con las columnas ordenadas por % de missing en orden DESC:
missing_col_sorted = missing_col.sort_values(ascending=False)

#reset el index
missing_col_sorted = missing_col_sorted.reset_index()
print(missing_col_sorted)
% of NAN per column:
 CODE_GENDER                      0.001219
AMT_ANNUITY                      0.004471
AMT_GOODS_PRICE                  0.085770
NAME_TYPE_SUITE                  0.407710
OWN_CAR_AGE                     65.985253
OCCUPATION_TYPE                 31.276219
CNT_FAM_MEMBERS                  0.000813
EXT_SOURCE_1                    56.310364
EXT_SOURCE_2                     0.209749
EXT_SOURCE_3                    19.907483
APARTMENTS_AVG                  50.715424
BASEMENTAREA_AVG                58.485903
YEARS_BEGINEXPLUATATION_AVG     48.752480
YEARS_BUILD_AVG                 66.479139
COMMONAREA_AVG                  69.847322
ELEVATORS_AVG                   53.250707
ENTRANCES_AVG                   50.298771
FLOORSMAX_AVG                   49.723993
FLOORSMIN_AVG                   67.831534
LANDAREA_AVG                    59.359452
LIVINGAPARTMENTS_AVG            68.351842
LIVINGAREA_AVG                  50.143085
NONLIVINGAPARTMENTS_AVG         69.427823
NONLIVINGAREA_AVG               55.127069
APARTMENTS_MODE                 50.715424
BASEMENTAREA_MODE               58.485903
YEARS_BEGINEXPLUATATION_MODE    48.752480
YEARS_BUILD_MODE                66.479139
COMMONAREA_MODE                 69.847322
ELEVATORS_MODE                  53.250707
ENTRANCES_MODE                  50.298771
FLOORSMAX_MODE                  49.723993
FLOORSMIN_MODE                  67.831534
LANDAREA_MODE                   59.359452
LIVINGAPARTMENTS_MODE           68.351842
LIVINGAREA_MODE                 50.143085
NONLIVINGAPARTMENTS_MODE        69.427823
NONLIVINGAREA_MODE              55.127069
APARTMENTS_MEDI                 50.715424
BASEMENTAREA_MEDI               58.485903
YEARS_BEGINEXPLUATATION_MEDI    48.752480
YEARS_BUILD_MEDI                66.479139
COMMONAREA_MEDI                 69.847322
ELEVATORS_MEDI                  53.250707
ENTRANCES_MEDI                  50.298771
FLOORSMAX_MEDI                  49.723993
FLOORSMIN_MEDI                  67.831534
LANDAREA_MEDI                   59.359452
LIVINGAPARTMENTS_MEDI           68.351842
LIVINGAREA_MEDI                 50.143085
NONLIVINGAPARTMENTS_MEDI        69.427823
NONLIVINGAREA_MEDI              55.127069
FONDKAPREMONT_MODE              68.368102
HOUSETYPE_MODE                  50.147556
TOTALAREA_MODE                  48.227293
WALLSMATERIAL_MODE              50.813388
EMERGENCYSTATE_MODE             47.361468
AMT_REQ_CREDIT_BUREAU_HOUR      13.573136
AMT_REQ_CREDIT_BUREAU_DAY       13.573136
AMT_REQ_CREDIT_BUREAU_WEEK      13.573136
AMT_REQ_CREDIT_BUREAU_MON       13.573136
AMT_REQ_CREDIT_BUREAU_QRT       13.573136
AMT_REQ_CREDIT_BUREAU_YEAR      13.573136
dtype: float64
                           index          0
0                COMMONAREA_MODE  69.847322
1                COMMONAREA_MEDI  69.847322
2                 COMMONAREA_AVG  69.847322
3       NONLIVINGAPARTMENTS_MODE  69.427823
4        NONLIVINGAPARTMENTS_AVG  69.427823
5       NONLIVINGAPARTMENTS_MEDI  69.427823
6             FONDKAPREMONT_MODE  68.368102
7           LIVINGAPARTMENTS_AVG  68.351842
8          LIVINGAPARTMENTS_MODE  68.351842
9          LIVINGAPARTMENTS_MEDI  68.351842
10                FLOORSMIN_MEDI  67.831534
11                FLOORSMIN_MODE  67.831534
12                 FLOORSMIN_AVG  67.831534
13              YEARS_BUILD_MODE  66.479139
14              YEARS_BUILD_MEDI  66.479139
15               YEARS_BUILD_AVG  66.479139
16                   OWN_CAR_AGE  65.985253
17                 LANDAREA_MEDI  59.359452
18                 LANDAREA_MODE  59.359452
19                  LANDAREA_AVG  59.359452
20             BASEMENTAREA_MEDI  58.485903
21              BASEMENTAREA_AVG  58.485903
22             BASEMENTAREA_MODE  58.485903
23                  EXT_SOURCE_1  56.310364
24            NONLIVINGAREA_MODE  55.127069
25             NONLIVINGAREA_AVG  55.127069
26            NONLIVINGAREA_MEDI  55.127069
27                 ELEVATORS_AVG  53.250707
28                ELEVATORS_MEDI  53.250707
29                ELEVATORS_MODE  53.250707
30            WALLSMATERIAL_MODE  50.813388
31                APARTMENTS_AVG  50.715424
32               APARTMENTS_MODE  50.715424
33               APARTMENTS_MEDI  50.715424
34                ENTRANCES_MODE  50.298771
35                ENTRANCES_MEDI  50.298771
36                 ENTRANCES_AVG  50.298771
37                HOUSETYPE_MODE  50.147556
38               LIVINGAREA_MEDI  50.143085
39               LIVINGAREA_MODE  50.143085
40                LIVINGAREA_AVG  50.143085
41                FLOORSMAX_MEDI  49.723993
42                FLOORSMAX_MODE  49.723993
43                 FLOORSMAX_AVG  49.723993
44   YEARS_BEGINEXPLUATATION_AVG  48.752480
45  YEARS_BEGINEXPLUATATION_MEDI  48.752480
46  YEARS_BEGINEXPLUATATION_MODE  48.752480
47                TOTALAREA_MODE  48.227293
48           EMERGENCYSTATE_MODE  47.361468
49               OCCUPATION_TYPE  31.276219
50                  EXT_SOURCE_3  19.907483
51     AMT_REQ_CREDIT_BUREAU_DAY  13.573136
52    AMT_REQ_CREDIT_BUREAU_WEEK  13.573136
53     AMT_REQ_CREDIT_BUREAU_MON  13.573136
54     AMT_REQ_CREDIT_BUREAU_QRT  13.573136
55    AMT_REQ_CREDIT_BUREAU_HOUR  13.573136
56    AMT_REQ_CREDIT_BUREAU_YEAR  13.573136
57               NAME_TYPE_SUITE   0.407710
58                  EXT_SOURCE_2   0.209749
59               AMT_GOODS_PRICE   0.085770
60                   AMT_ANNUITY   0.004471
61                   CODE_GENDER   0.001219
62               CNT_FAM_MEMBERS   0.000813
In [44]:
eda.plot_missing_values(data)
No description has been provided for this image
In [45]:
#Valores faltantes por fila y con variable objetivo para cada fila 
missing_row_sorted = eda.check_missing_per_row(data_train)

#Visualizacion del dataframe :
missing_row_sorted
Out[45]:
missing_percentage TARGET
236260 50.000000 0
249616 50.000000 1
224619 49.152542 0
267335 49.152542 0
26398 49.152542 0
... ... ...
33699 0.000000 0
239409 0.000000 0
221021 0.000000 0
167597 0.000000 0
183048 0.000000 0

246008 rows × 2 columns

VISUALIZATION OF CONTINUOUS AND CATEGORICAL VARIABLES¶

In [47]:
#llamo a la funcion de mi archivo de funciones: 
#con esta funcion, pusé en una lista las categoricas: categort, object, y numericos con menos de 20 valores unicos 

#y en una otra lista las variables continuas que son tipo float o int, con mas de 20 valores unicos 


categorical_vars, continuous_vars = eda.dame_variables_categoricas(data_train)

print("Variables categóricas:", categorical_vars)
print("----------------------------------------------------------------")
print("Variables continuas:", continuous_vars)
Variables categóricas: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT', 'TARGET']
----------------------------------------------------------------
Variables continuas: ['SK_ID_CURR', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'HOUR_APPR_PROCESS_START', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR']
In [19]:
#creo carpeta de imagenes dedicada a la EDA
os.makedirs('../images/02_notebook_images', exist_ok=True)

#llamo mi funcion para visualizar las categoricas, y respeto a la variable TARGET:
eda.plot_all_variables(data_train, categorical_vars, 'TARGET')
Analyzing NAME_CONTRACT_TYPE
No description has been provided for this image
Analyzing CODE_GENDER
No description has been provided for this image
Analyzing FLAG_OWN_CAR
No description has been provided for this image
Analyzing FLAG_OWN_REALTY
No description has been provided for this image
Analyzing CNT_CHILDREN
No description has been provided for this image
Analyzing NAME_TYPE_SUITE
No description has been provided for this image
Analyzing NAME_INCOME_TYPE
No description has been provided for this image
Analyzing NAME_EDUCATION_TYPE
No description has been provided for this image
Analyzing NAME_FAMILY_STATUS
No description has been provided for this image
Analyzing NAME_HOUSING_TYPE
No description has been provided for this image
Analyzing FLAG_MOBIL
No description has been provided for this image
Analyzing FLAG_EMP_PHONE
No description has been provided for this image
Analyzing FLAG_WORK_PHONE
No description has been provided for this image
Analyzing FLAG_CONT_MOBILE
No description has been provided for this image
Analyzing FLAG_PHONE
No description has been provided for this image
Analyzing FLAG_EMAIL
No description has been provided for this image
Analyzing OCCUPATION_TYPE
No description has been provided for this image
Analyzing CNT_FAM_MEMBERS
No description has been provided for this image
Analyzing REGION_RATING_CLIENT
No description has been provided for this image
Analyzing REGION_RATING_CLIENT_W_CITY
No description has been provided for this image
Analyzing WEEKDAY_APPR_PROCESS_START
No description has been provided for this image
Analyzing REG_REGION_NOT_LIVE_REGION
No description has been provided for this image
Analyzing REG_REGION_NOT_WORK_REGION
No description has been provided for this image
Analyzing LIVE_REGION_NOT_WORK_REGION
No description has been provided for this image
Analyzing REG_CITY_NOT_LIVE_CITY
No description has been provided for this image
Analyzing REG_CITY_NOT_WORK_CITY
No description has been provided for this image
Analyzing LIVE_CITY_NOT_WORK_CITY
No description has been provided for this image
Analyzing ORGANIZATION_TYPE
No description has been provided for this image
Analyzing FONDKAPREMONT_MODE
No description has been provided for this image
Analyzing HOUSETYPE_MODE
No description has been provided for this image
Analyzing WALLSMATERIAL_MODE
No description has been provided for this image
Analyzing EMERGENCYSTATE_MODE
No description has been provided for this image
Analyzing FLAG_DOCUMENT_2
No description has been provided for this image
Analyzing FLAG_DOCUMENT_3
No description has been provided for this image
Analyzing FLAG_DOCUMENT_4
No description has been provided for this image
Analyzing FLAG_DOCUMENT_5
No description has been provided for this image
Analyzing FLAG_DOCUMENT_6
No description has been provided for this image
Analyzing FLAG_DOCUMENT_7
No description has been provided for this image
Analyzing FLAG_DOCUMENT_8
No description has been provided for this image
Analyzing FLAG_DOCUMENT_9
No description has been provided for this image
Analyzing FLAG_DOCUMENT_10
No description has been provided for this image
Analyzing FLAG_DOCUMENT_11
No description has been provided for this image
Analyzing FLAG_DOCUMENT_12
No description has been provided for this image
Analyzing FLAG_DOCUMENT_13
No description has been provided for this image
Analyzing FLAG_DOCUMENT_14
No description has been provided for this image
Analyzing FLAG_DOCUMENT_15
No description has been provided for this image
Analyzing FLAG_DOCUMENT_16
No description has been provided for this image
Analyzing FLAG_DOCUMENT_17
No description has been provided for this image
Analyzing FLAG_DOCUMENT_18
No description has been provided for this image
Analyzing FLAG_DOCUMENT_19
No description has been provided for this image
Analyzing FLAG_DOCUMENT_20
No description has been provided for this image
Analyzing FLAG_DOCUMENT_21
No description has been provided for this image
Analyzing AMT_REQ_CREDIT_BUREAU_HOUR
No description has been provided for this image
Analyzing AMT_REQ_CREDIT_BUREAU_DAY
No description has been provided for this image
Analyzing AMT_REQ_CREDIT_BUREAU_WEEK
No description has been provided for this image
Analyzing AMT_REQ_CREDIT_BUREAU_QRT
No description has been provided for this image
Analyzing TARGET
No description has been provided for this image
In [25]:
# llamo la funcion que grafica, guarda las imagenes de mis variables continuas:
eda.plot_all_features(data_train, continuous_vars, target_col='TARGET')
Iniciando análisis de 61 características continuas

Procesando lote 1 de 21
No description has been provided for this image
Gráfico de 'SK_ID_CURR' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'AMT_INCOME_TOTAL' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'AMT_CREDIT' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 2 de 21
No description has been provided for this image
Gráfico de 'AMT_ANNUITY' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'AMT_GOODS_PRICE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'REGION_POPULATION_RELATIVE' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 3 de 21
No description has been provided for this image
Gráfico de 'DAYS_BIRTH' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'DAYS_EMPLOYED' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'DAYS_REGISTRATION' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 4 de 21
No description has been provided for this image
Gráfico de 'DAYS_ID_PUBLISH' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'OWN_CAR_AGE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'HOUR_APPR_PROCESS_START' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 5 de 21
No description has been provided for this image
Gráfico de 'EXT_SOURCE_1' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'EXT_SOURCE_2' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'EXT_SOURCE_3' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 6 de 21
No description has been provided for this image
Gráfico de 'APARTMENTS_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'BASEMENTAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'YEARS_BEGINEXPLUATATION_AVG' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 7 de 21
No description has been provided for this image
Gráfico de 'YEARS_BUILD_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'COMMONAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'ELEVATORS_AVG' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 8 de 21
No description has been provided for this image
Gráfico de 'ENTRANCES_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'FLOORSMAX_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'FLOORSMIN_AVG' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 9 de 21
No description has been provided for this image
Gráfico de 'LANDAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'LIVINGAPARTMENTS_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'LIVINGAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 10 de 21
No description has been provided for this image
Gráfico de 'NONLIVINGAPARTMENTS_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'NONLIVINGAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'APARTMENTS_MODE' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 11 de 21
No description has been provided for this image
Gráfico de 'BASEMENTAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'YEARS_BEGINEXPLUATATION_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'YEARS_BUILD_MODE' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 12 de 21
No description has been provided for this image
Gráfico de 'COMMONAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'ELEVATORS_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'ENTRANCES_MODE' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 13 de 21
No description has been provided for this image
Gráfico de 'FLOORSMAX_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'FLOORSMIN_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'LANDAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 14 de 21
No description has been provided for this image
Gráfico de 'LIVINGAPARTMENTS_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'LIVINGAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'NONLIVINGAPARTMENTS_MODE' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 15 de 21
No description has been provided for this image
Gráfico de 'NONLIVINGAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'APARTMENTS_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'BASEMENTAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 16 de 21
No description has been provided for this image
Gráfico de 'YEARS_BEGINEXPLUATATION_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'YEARS_BUILD_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'COMMONAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 17 de 21
No description has been provided for this image
Gráfico de 'ELEVATORS_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'ENTRANCES_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'FLOORSMAX_MEDI' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 18 de 21
No description has been provided for this image
Gráfico de 'FLOORSMIN_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'LANDAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'LIVINGAPARTMENTS_MEDI' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 19 de 21
No description has been provided for this image
Gráfico de 'LIVINGAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'NONLIVINGAPARTMENTS_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'NONLIVINGAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 20 de 21
No description has been provided for this image
Gráfico de 'TOTALAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'DAYS_LAST_PHONE_CHANGE' guardado exitosamente en '..\images\02_notebook_images'.
No description has been provided for this image
Gráfico de 'AMT_REQ_CREDIT_BUREAU_MON' guardado exitosamente en '..\images\02_notebook_images'.

Procesando lote 21 de 21
No description has been provided for this image
Gráfico de 'AMT_REQ_CREDIT_BUREAU_YEAR' guardado exitosamente en '..\images\02_notebook_images'.

Proceso completado exitosamente.

Conclusión¶

Para concluir:

Vamos a destacar las tendencias observadas en la distribución de las variables en relación con TARGET para identificar aquellas que, a simple vista, parecen tener mayor influencia en la clasificación.

Factores laborales¶

  • ORGANIZATION_TYPE muestra que ciertos tipos de trabajos, como limpieza, electricidad y cultura, están más asociados con la clase 1, lo que sugiere una relación entre el tipo de ámbito laboral y la propensión a retrasos.
  • OCCUPATION_TYPE refuerza esta tendencia, destacando que los trabajos no calificados, como camareros, conductores, cocineros y personal de seguridad, tienen una mayor proporción en la clase 1.

Situación socioeconómica¶

  • En NAME_INCOME_TYPE, los desempleados y quienes están en permiso por maternidad tienen una mayor probabilidad de retraso (clase 1), reflejando factores socioeconómicos que notamos en la vida real.
  • Los propietarios de una vivienda y un coche tienden a pertenecer más a la clase 0, aunque esta diferencia no es significativa.

Factores demográficos¶

  • En cuanto al género, los hombres están más representados en la clase 1, mientras que las mujeres predominan en la clase 0.
  • La educación parece ser crucial: un mayor nivel educativo está asociado con la clase 0, mientras que niveles bajos se relacionan con la clase 1.
  • La edad también influye: los clientes más mayores tienden a estar en la clase 0, mientras que los más jóvenes son más propensos a retrasos (clase 1).

Características del préstamo¶

  • Los préstamos revolving están mayormente asociados con la clase 0, mientras que los préstamos cash predominan en la clase 1. Esto podría deberse a que los préstamos revolving reflejan una mejor gestión financiera.
  • Un crédito más alto parece estar vinculado a la clase 0, posiblemente debido a filtros más estrictos para aprobar montos elevados.
  • Aunque se esperaría que las annuities más altas se relacionen con la clase 1, los datos muestran que están más presentes en la clase 0, con algunos outliers en la clase 1.

Factores familiares¶

  • El número de ninos impacta significativamente: a medida que aumenta, también lo hace la proporción de clientes en la clase 1. significando una mayor probabilidad de retraso.
  • Sin embargo, no se observan patrones claros relacionados con el estado civil segun si el cliente que pidio el prestamo estaba casado o no.

Factores externos¶

  • Las clasificaciones externas (no tenemos insights sobre como se hizo la clasificacion), como el rating de región y rating de cliente, indican que regiones y clientes de tipo 3 tienen mayor probabilidad de pertenecer a la clase 1. Sugeriendo que mas alto el rating, mas riesgos surgen

Conclusion para la variable INCOME¶

  • El ingreso del cliente es una variable que considero imprescindible,

Se observa una tendencia central más alta en la clase 0, que presenta una mediana de ingresos superior. La dispersión es prácticamente igual, dado que el rango intercuartil es bastante similar en ambas clases. Aunque hay una ligera diferencia de valores extremos en la clase 0 (significando que gente con mas ingresos tiene mas probabilidad de ser de clase 0), no es lo suficientemente significativa como para concluir que las personas con retraso tienen una diferencia considerable en los ingresos en comparación con otros casos.

A continuación, se tratan los valores missing, las correlaciones de las variables continuas y los outliers¶

In [53]:
continuous_vars
Out[53]:
['SK_ID_CURR',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'HOUR_APPR_PROCESS_START',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

OUTLIERS¶

In [55]:
#identificar los valores extremos para variables continuas 
# basándose en un rango definido por un intervalo de confianza multiplicado 
# por una desviación estándar
eda.get_deviation_of_mean_perc(data_train, continuous_vars, target='TARGET', multiplier=3)
Out[55]:
0.0 1.0 variable sum_outlier_values porcentaje_sum_null_values
0 0.938095 0.061905 AMT_INCOME_TOTAL 210 0.000854
1 0.958988 0.041012 AMT_CREDIT 2609 0.010605
2 0.960951 0.039049 AMT_ANNUITY 2356 0.009577
3 0.959916 0.040084 AMT_GOODS_PRICE 3318 0.013487
4 0.959733 0.040267 REGION_POPULATION_RELATIVE 6730 0.027357
5 0.958403 0.041597 DAYS_REGISTRATION 601 0.002443
6 0.916479 0.083521 OWN_CAR_AGE 2670 0.010853
7 0.897638 0.102362 HOUR_APPR_PROCESS_START 508 0.002065
8 0.950358 0.049642 APARTMENTS_AVG 2377 0.009662
9 0.949077 0.050923 BASEMENTAREA_AVG 1571 0.006386
10 0.917431 0.082569 YEARS_BEGINEXPLUATATION_AVG 545 0.002215
11 0.921466 0.078534 YEARS_BUILD_AVG 955 0.003882
12 0.944767 0.055233 COMMONAREA_AVG 1376 0.005593
13 0.953775 0.046225 ELEVATORS_AVG 1947 0.007914
14 0.938833 0.061167 ENTRANCES_AVG 1782 0.007244
15 0.957355 0.042645 FLOORSMAX_AVG 2087 0.008483
16 0.970588 0.029412 FLOORSMIN_AVG 476 0.001935
17 0.935252 0.064748 LANDAREA_AVG 1668 0.006780
18 0.948313 0.051687 LIVINGAPARTMENTS_AVG 1393 0.005662
19 0.947904 0.052096 LIVINGAREA_AVG 2553 0.010378
20 0.932743 0.067257 NONLIVINGAPARTMENTS_AVG 565 0.002297
21 0.948454 0.051546 NONLIVINGAREA_AVG 1940 0.007886
22 0.948505 0.051495 APARTMENTS_MODE 2408 0.009788
23 0.944745 0.055255 BASEMENTAREA_MODE 1665 0.006768
24 0.915414 0.084586 YEARS_BEGINEXPLUATATION_MODE 532 0.002163
25 0.922441 0.077559 YEARS_BUILD_MODE 967 0.003931
26 0.939839 0.060161 COMMONAREA_MODE 1363 0.005540
27 0.949329 0.050671 ELEVATORS_MODE 2684 0.010910
28 0.938244 0.061756 ENTRANCES_MODE 1765 0.007175
29 0.958768 0.041232 FLOORSMAX_MODE 2110 0.008577
30 0.971429 0.028571 FLOORSMIN_MODE 385 0.001565
31 0.935972 0.064028 LANDAREA_MODE 1718 0.006984
32 0.944095 0.055905 LIVINGAPARTMENTS_MODE 1431 0.005817
33 0.945673 0.054327 LIVINGAREA_MODE 2669 0.010849
34 0.926554 0.073446 NONLIVINGAPARTMENTS_MODE 531 0.002158
35 0.949058 0.050942 NONLIVINGAREA_MODE 1963 0.007979
36 0.949979 0.050021 APARTMENTS_MEDI 2419 0.009833
37 0.947937 0.052063 BASEMENTAREA_MEDI 1575 0.006402
38 0.913386 0.086614 YEARS_BEGINEXPLUATATION_MEDI 508 0.002065
39 0.922280 0.077720 YEARS_BUILD_MEDI 965 0.003923
40 0.943844 0.056156 COMMONAREA_MEDI 1389 0.005646
41 0.953981 0.046019 ELEVATORS_MEDI 1934 0.007862
42 0.938582 0.061418 ENTRANCES_MEDI 1791 0.007280
43 0.957515 0.042485 FLOORSMAX_MEDI 2189 0.008898
44 0.970852 0.029148 FLOORSMIN_MEDI 446 0.001813
45 0.938596 0.061404 LANDAREA_MEDI 1710 0.006951
46 0.947820 0.052180 LIVINGAPARTMENTS_MEDI 1399 0.005687
47 0.948968 0.051032 LIVINGAREA_MEDI 2567 0.010435
48 0.930728 0.069272 NONLIVINGAPARTMENTS_MEDI 563 0.002289
49 0.948692 0.051308 NONLIVINGAREA_MEDI 1949 0.007923
50 0.956044 0.043956 TOTALAREA_MODE 2639 0.010727
51 0.965974 0.034026 DAYS_LAST_PHONE_CHANGE 529 0.002150
52 0.948478 0.051522 AMT_REQ_CREDIT_BUREAU_MON 2562 0.010414
53 0.911004 0.088996 AMT_REQ_CREDIT_BUREAU_YEAR 2708 0.011008

El método de la desviación estándar es un método clásico que he utilizado y que parece adecuado para la distribución de mis datos. Me parece confiable, no decidi explorar métodos alternativos para obtener una mayor precisión.

Para concluir el analisis de valores extremos continuos:¶

Una proporción tan baja de valores atípicos globalmente significa que su impacto global en la distribución probablemente sea limitado.
Además, en variables importantes como el ingreso (income), el número de valores atípicos es extremadamente bajo.
No es indispensable tratarlos de momento, pero quedo atenta a la eleccion del modelo para ver si se debe ajustar la precision y si puede tener una importancia adaptarlos si el modelo lo pide.

Se podria utilizar métodos como capping, la imputación con la mediana o la media, las transformaciones matemáticas (logaritmo, raíz cuadrada), o removarlos...

CORRELATION CATEGORICAL AND NUMERICAL SEPARATELY¶

  • MATRIZ Y COEF. DE CORRELACION
In [62]:
eda.get_corr_matrix(dataset = data_train[continuous_vars], 
                metodo='pearson', size_figure=[10,8])
No description has been provided for this image
Out[62]:
0
In [63]:
corr = data_train[continuous_vars].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.55]
Out[63]:
level_0 level_1 correlation
2824 YEARS_BUILD_MEDI YEARS_BUILD_AVG 0.998419
3134 FLOORSMIN_MEDI FLOORSMIN_AVG 0.997352
3072 FLOORSMAX_MEDI FLOORSMAX_AVG 0.997141
3010 ENTRANCES_MEDI ENTRANCES_AVG 0.996948
2948 ELEVATORS_MEDI ELEVATORS_AVG 0.996008
2886 COMMONAREA_MEDI COMMONAREA_AVG 0.995816
3320 LIVINGAREA_MEDI LIVINGAREA_AVG 0.995450
2638 APARTMENTS_MEDI APARTMENTS_AVG 0.995270
3258 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.994466
2700 BASEMENTAREA_MEDI BASEMENTAREA_AVG 0.994035
2762 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_AVG 0.993125
3444 NONLIVINGAREA_MEDI NONLIVINGAREA_AVG 0.991649
3196 LANDAREA_MEDI LANDAREA_AVG 0.991599
3382 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_AVG 0.990498
1970 YEARS_BUILD_MODE YEARS_BUILD_AVG 0.989127
2838 YEARS_BUILD_MEDI YEARS_BUILD_MODE 0.989118
3148 FLOORSMIN_MEDI FLOORSMIN_MODE 0.988433
3086 FLOORSMAX_MEDI FLOORSMAX_MODE 0.988204
246 AMT_GOODS_PRICE AMT_CREDIT 0.986997
2280 FLOORSMIN_MODE FLOORSMIN_AVG 0.986046
2218 FLOORSMAX_MODE FLOORSMAX_AVG 0.985710
2962 ELEVATORS_MEDI ELEVATORS_MODE 0.982707
3210 LANDAREA_MEDI LANDAREA_MODE 0.980788
3024 ENTRANCES_MEDI ENTRANCES_MODE 0.980273
2900 COMMONAREA_MEDI COMMONAREA_MODE 0.979032
2094 ELEVATORS_MODE ELEVATORS_AVG 0.978604
2156 ENTRANCES_MODE ENTRANCES_AVG 0.977388
2652 APARTMENTS_MEDI APARTMENTS_MODE 0.977085
3396 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_MODE 0.977053
2714 BASEMENTAREA_MEDI BASEMENTAREA_MODE 0.976871
2032 COMMONAREA_MODE COMMONAREA_AVG 0.976245
3272 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.975746
3458 NONLIVINGAREA_MEDI NONLIVINGAREA_MODE 0.975426
3334 LIVINGAREA_MEDI LIVINGAREA_MODE 0.974965
1784 APARTMENTS_MODE APARTMENTS_AVG 0.973294
2342 LANDAREA_MODE LANDAREA_AVG 0.973156
2466 LIVINGAREA_MODE LIVINGAREA_AVG 0.972183
1846 BASEMENTAREA_MODE BASEMENTAREA_AVG 0.972080
2404 LIVINGAPARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.970693
1908 YEARS_BEGINEXPLUATATION_MODE YEARS_BEGINEXPLUATATION_AVG 0.970327
2528 NONLIVINGAPARTMENTS_MODE NONLIVINGAPARTMENTS_AVG 0.967621
2590 NONLIVINGAREA_MODE NONLIVINGAREA_AVG 0.967063
2776 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE 0.960662
1540 LIVINGAPARTMENTS_AVG APARTMENTS_AVG 0.944185
3248 LIVINGAPARTMENTS_MEDI APARTMENTS_AVG 0.942998
3276 LIVINGAPARTMENTS_MEDI APARTMENTS_MEDI 0.942776
2408 LIVINGAPARTMENTS_MODE APARTMENTS_MODE 0.939285
2648 APARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.936461
2662 APARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.933201
2394 LIVINGAPARTMENTS_MODE APARTMENTS_AVG 0.931936
3503 TOTALAREA_MODE LIVINGAREA_AVG 0.925681
3531 TOTALAREA_MODE LIVINGAREA_MEDI 0.920420
3337 LIVINGAREA_MEDI APARTMENTS_MEDI 0.917647
3262 LIVINGAPARTMENTS_MEDI APARTMENTS_MODE 0.915566
1601 LIVINGAREA_AVG APARTMENTS_AVG 0.915396
3309 LIVINGAREA_MEDI APARTMENTS_AVG 0.914270
2649 APARTMENTS_MEDI LIVINGAREA_AVG 0.914240
2469 LIVINGAREA_MODE APARTMENTS_MODE 0.912045
1794 APARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.909813
3517 TOTALAREA_MODE LIVINGAREA_MODE 0.900182
2663 APARTMENTS_MEDI LIVINGAREA_MODE 0.897528
3323 LIVINGAREA_MEDI APARTMENTS_MODE 0.895802
2455 LIVINGAREA_MODE APARTMENTS_AVG 0.895123
3492 TOTALAREA_MODE APARTMENTS_AVG 0.893126
1795 APARTMENTS_MODE LIVINGAREA_AVG 0.892349
3520 TOTALAREA_MODE APARTMENTS_MEDI 0.887156
3347 LIVINGAREA_MEDI LIVINGAPARTMENTS_MEDI 0.884883
3259 LIVINGAPARTMENTS_MEDI LIVINGAREA_AVG 0.883318
1611 LIVINGAREA_AVG LIVINGAPARTMENTS_AVG 0.881072
2479 LIVINGAREA_MODE LIVINGAPARTMENTS_MODE 0.879649
3319 LIVINGAREA_MEDI LIVINGAPARTMENTS_AVG 0.879001
3333 LIVINGAREA_MEDI LIVINGAPARTMENTS_MODE 0.874785
2405 LIVINGAPARTMENTS_MODE LIVINGAREA_AVG 0.873256
3342 LIVINGAREA_MEDI ELEVATORS_MEDI 0.869387
1606 LIVINGAREA_AVG ELEVATORS_AVG 0.868331
2954 ELEVATORS_MEDI LIVINGAREA_AVG 0.866723
3314 LIVINGAREA_MEDI ELEVATORS_AVG 0.866426
3506 TOTALAREA_MODE APARTMENTS_MODE 0.864362
3273 LIVINGAPARTMENTS_MEDI LIVINGAREA_MODE 0.858390
2474 LIVINGAREA_MODE ELEVATORS_MODE 0.856916
3328 LIVINGAREA_MEDI ELEVATORS_MODE 0.856738
2100 ELEVATORS_MODE LIVINGAREA_AVG 0.853372
2465 LIVINGAREA_MODE LIVINGAPARTMENTS_AVG 0.853118
3502 TOTALAREA_MODE LIVINGAPARTMENTS_AVG 0.849229
3530 TOTALAREA_MODE LIVINGAPARTMENTS_MEDI 0.847481
3497 TOTALAREA_MODE ELEVATORS_AVG 0.843365
2968 ELEVATORS_MEDI LIVINGAREA_MODE 0.841639
2460 LIVINGAREA_MODE ELEVATORS_AVG 0.839344
3525 TOTALAREA_MODE ELEVATORS_MEDI 0.837275
2971 ELEVATORS_MEDI APARTMENTS_MEDI 0.836541
1235 ELEVATORS_AVG APARTMENTS_AVG 0.836042
3516 TOTALAREA_MODE LIVINGAPARTMENTS_MODE 0.835947
2943 ELEVATORS_MEDI APARTMENTS_AVG 0.834482
2643 APARTMENTS_MEDI ELEVATORS_AVG 0.833536
2103 ELEVATORS_MODE APARTMENTS_MODE 0.825541
2657 APARTMENTS_MEDI ELEVATORS_MODE 0.824718
2089 ELEVATORS_MODE APARTMENTS_AVG 0.821834
3511 TOTALAREA_MODE ELEVATORS_MODE 0.820205
3281 LIVINGAPARTMENTS_MEDI ELEVATORS_MEDI 0.812438
3253 LIVINGAPARTMENTS_MEDI ELEVATORS_AVG 0.810733
1545 LIVINGAPARTMENTS_AVG ELEVATORS_AVG 0.810014
2953 ELEVATORS_MEDI LIVINGAPARTMENTS_AVG 0.807994
2957 ELEVATORS_MEDI APARTMENTS_MODE 0.807919
2413 LIVINGAPARTMENTS_MODE ELEVATORS_MODE 0.806637
1789 APARTMENTS_MODE ELEVATORS_AVG 0.804933
3267 LIVINGAPARTMENTS_MEDI ELEVATORS_MODE 0.798001
2967 ELEVATORS_MEDI LIVINGAPARTMENTS_MODE 0.797162
2399 LIVINGAPARTMENTS_MODE ELEVATORS_AVG 0.795324
2099 ELEVATORS_MODE LIVINGAPARTMENTS_AVG 0.793141
247 AMT_GOODS_PRICE AMT_ANNUITY 0.775701
185 AMT_ANNUITY AMT_CREDIT 0.770691
1425 FLOORSMIN_AVG FLOORSMAX_AVG 0.742447
3161 FLOORSMIN_MEDI FLOORSMAX_MEDI 0.740578
3133 FLOORSMIN_MEDI FLOORSMAX_AVG 0.740053
3073 FLOORSMAX_MEDI FLOORSMIN_AVG 0.740023
3147 FLOORSMIN_MEDI FLOORSMAX_MODE 0.730022
2219 FLOORSMAX_MODE FLOORSMIN_AVG 0.729350
2293 FLOORSMIN_MODE FLOORSMAX_MODE 0.727067
3087 FLOORSMAX_MEDI FLOORSMIN_MODE 0.723708
2279 FLOORSMIN_MODE FLOORSMAX_AVG 0.722966
1602 LIVINGAREA_AVG BASEMENTAREA_AVG 0.695154
2710 BASEMENTAREA_MEDI LIVINGAREA_AVG 0.695129
3338 LIVINGAREA_MEDI BASEMENTAREA_MEDI 0.694073
2470 LIVINGAREA_MODE BASEMENTAREA_MODE 0.693588
3310 LIVINGAREA_MEDI BASEMENTAREA_AVG 0.691838
2724 BASEMENTAREA_MEDI LIVINGAREA_MODE 0.683185
2727 BASEMENTAREA_MEDI APARTMENTS_MEDI 0.682731
991 BASEMENTAREA_AVG APARTMENTS_AVG 0.681712
2699 BASEMENTAREA_MEDI APARTMENTS_AVG 0.681551
1859 BASEMENTAREA_MODE APARTMENTS_MODE 0.681342
2639 APARTMENTS_MEDI BASEMENTAREA_AVG 0.680320
1362 FLOORSMAX_AVG ELEVATORS_AVG 0.680080
2456 LIVINGAREA_MODE BASEMENTAREA_AVG 0.679499
3070 FLOORSMAX_MEDI ELEVATORS_AVG 0.677748
1856 BASEMENTAREA_MODE LIVINGAREA_AVG 0.676864
2950 ELEVATORS_MEDI FLOORSMAX_AVG 0.676434
3324 LIVINGAREA_MEDI BASEMENTAREA_MODE 0.676385
3098 FLOORSMAX_MEDI ELEVATORS_MEDI 0.675868
3493 TOTALAREA_MODE BASEMENTAREA_AVG 0.673406
2216 FLOORSMAX_MODE ELEVATORS_AVG 0.670997
3521 TOTALAREA_MODE BASEMENTAREA_MEDI 0.670901
2713 BASEMENTAREA_MEDI APARTMENTS_MODE 0.670827
2964 ELEVATORS_MEDI FLOORSMAX_MODE 0.669300
1785 APARTMENTS_MODE BASEMENTAREA_AVG 0.667797
2653 APARTMENTS_MEDI BASEMENTAREA_MODE 0.666497
1845 BASEMENTAREA_MODE APARTMENTS_AVG 0.664205
2230 FLOORSMAX_MODE ELEVATORS_MODE 0.661472
2409 LIVINGAPARTMENTS_MODE BASEMENTAREA_MODE 0.657269
2165 ENTRANCES_MODE BASEMENTAREA_MODE 0.656745
2096 ELEVATORS_MODE FLOORSMAX_AVG 0.656743
3084 FLOORSMAX_MEDI ELEVATORS_MODE 0.656085
1851 BASEMENTAREA_MODE ENTRANCES_AVG 0.655926
2705 BASEMENTAREA_MEDI ENTRANCES_AVG 0.655063
3019 ENTRANCES_MEDI BASEMENTAREA_MODE 0.654844
2723 BASEMENTAREA_MEDI LIVINGAPARTMENTS_MODE 0.653997
3277 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MEDI 0.653879
3033 ENTRANCES_MEDI BASEMENTAREA_MEDI 0.653525
1297 ENTRANCES_AVG BASEMENTAREA_AVG 0.652845
3249 LIVINGAPARTMENTS_MEDI BASEMENTAREA_AVG 0.651098
3507 TOTALAREA_MODE BASEMENTAREA_MODE 0.650905
2709 BASEMENTAREA_MEDI LIVINGAPARTMENTS_AVG 0.649699
2395 LIVINGAPARTMENTS_MODE BASEMENTAREA_AVG 0.649393
1541 LIVINGAPARTMENTS_AVG BASEMENTAREA_AVG 0.649292
3005 ENTRANCES_MEDI BASEMENTAREA_AVG 0.648754
2719 BASEMENTAREA_MEDI ENTRANCES_MODE 0.633048
3263 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MODE 0.632473
3499 TOTALAREA_MODE FLOORSMAX_AVG 0.632121
1608 LIVINGAREA_AVG FLOORSMAX_AVG 0.631623
3076 FLOORSMAX_MEDI LIVINGAREA_AVG 0.629591
3527 TOTALAREA_MODE FLOORSMAX_MEDI 0.629423
3316 LIVINGAREA_MEDI FLOORSMAX_AVG 0.628424
1855 BASEMENTAREA_MODE LIVINGAPARTMENTS_AVG 0.628040
2151 ENTRANCES_MODE BASEMENTAREA_AVG 0.627817
3344 LIVINGAREA_MEDI FLOORSMAX_MEDI 0.627624
2222 FLOORSMAX_MODE LIVINGAREA_AVG 0.627241
3330 LIVINGAREA_MEDI FLOORSMAX_MODE 0.625625
3513 TOTALAREA_MODE FLOORSMAX_MODE 0.624915
2475 LIVINGAREA_MODE ENTRANCES_MODE 0.621397
2461 LIVINGAREA_MODE ENTRANCES_AVG 0.621177
3029 ENTRANCES_MEDI LIVINGAREA_MODE 0.620609
1357 FLOORSMAX_AVG APARTMENTS_AVG 0.618742
3315 LIVINGAREA_MEDI ENTRANCES_AVG 0.618092
3343 LIVINGAREA_MEDI ENTRANCES_MEDI 0.617806
1607 LIVINGAREA_AVG ENTRANCES_AVG 0.617606
3065 FLOORSMAX_MEDI APARTMENTS_AVG 0.616645
433 DAYS_EMPLOYED DAYS_BIRTH 0.615939
2645 APARTMENTS_MEDI FLOORSMAX_AVG 0.615532
2164 ENTRANCES_MODE APARTMENTS_MODE 0.614903
3093 FLOORSMAX_MEDI APARTMENTS_MEDI 0.614634
2211 FLOORSMAX_MODE APARTMENTS_AVG 0.614561
3015 ENTRANCES_MEDI LIVINGAREA_AVG 0.613990
2659 APARTMENTS_MEDI FLOORSMAX_MODE 0.612911
3018 ENTRANCES_MEDI APARTMENTS_MODE 0.611534
1790 APARTMENTS_MODE ENTRANCES_AVG 0.611277
2644 APARTMENTS_MEDI ENTRANCES_AVG 0.610779
1296 ENTRANCES_AVG APARTMENTS_AVG 0.610692
3032 ENTRANCES_MEDI APARTMENTS_MEDI 0.610665
2476 LIVINGAREA_MODE FLOORSMAX_MODE 0.607347
3004 ENTRANCES_MEDI APARTMENTS_AVG 0.607231
738 EXT_SOURCE_1 DAYS_BIRTH 0.600492
2462 LIVINGAREA_MODE FLOORSMAX_AVG 0.598360
3090 FLOORSMAX_MEDI LIVINGAREA_MODE 0.597595
2225 FLOORSMAX_MODE APARTMENTS_MODE 0.596504
3498 TOTALAREA_MODE ENTRANCES_AVG 0.593474
3329 LIVINGAREA_MEDI ENTRANCES_MODE 0.593359
1547 LIVINGAPARTMENTS_AVG FLOORSMAX_AVG 0.589843
3255 LIVINGAPARTMENTS_MEDI FLOORSMAX_AVG 0.589165
2161 ENTRANCES_MODE LIVINGAREA_AVG 0.588945
3526 TOTALAREA_MODE ENTRANCES_MEDI 0.587157
2658 APARTMENTS_MEDI ENTRANCES_MODE 0.586964
3283 LIVINGAPARTMENTS_MEDI FLOORSMAX_MEDI 0.586745
3075 FLOORSMAX_MEDI LIVINGAPARTMENTS_AVG 0.586583
1791 APARTMENTS_MODE FLOORSMAX_AVG 0.586504
3079 FLOORSMAX_MEDI APARTMENTS_MODE 0.585628
3269 LIVINGAPARTMENTS_MEDI FLOORSMAX_MODE 0.583228
2221 FLOORSMAX_MODE LIVINGAPARTMENTS_AVG 0.582849
2150 ENTRANCES_MODE APARTMENTS_AVG 0.582479
2400 LIVINGAPARTMENTS_MODE ENTRANCES_AVG 0.573465
2415 LIVINGAPARTMENTS_MODE FLOORSMAX_MODE 0.572674
3028 ENTRANCES_MEDI LIVINGAPARTMENTS_MODE 0.572349
2401 LIVINGAPARTMENTS_MODE FLOORSMAX_AVG 0.568441
3089 FLOORSMAX_MEDI LIVINGAPARTMENTS_MODE 0.566405
3254 LIVINGAPARTMENTS_MEDI ENTRANCES_AVG 0.566080
2414 LIVINGAPARTMENTS_MODE ENTRANCES_MODE 0.565401
1236 ELEVATORS_AVG BASEMENTAREA_AVG 0.564902
3282 LIVINGAPARTMENTS_MEDI ENTRANCES_MEDI 0.564775
1546 LIVINGAPARTMENTS_AVG ENTRANCES_AVG 0.563805
2972 ELEVATORS_MEDI BASEMENTAREA_MEDI 0.563126
2704 BASEMENTAREA_MEDI ELEVATORS_AVG 0.563083
2944 ELEVATORS_MEDI BASEMENTAREA_AVG 0.562466
3014 ENTRANCES_MEDI LIVINGAPARTMENTS_AVG 0.559816
3512 TOTALAREA_MODE ENTRANCES_MODE 0.559791
2718 BASEMENTAREA_MEDI ELEVATORS_MODE 0.556170
2090 ELEVATORS_MODE BASEMENTAREA_AVG 0.555015
2104 ELEVATORS_MODE BASEMENTAREA_MODE 0.554090
3496 TOTALAREA_MODE COMMONAREA_AVG 0.553295
3524 TOTALAREA_MODE COMMONAREA_MEDI 0.552846
3341 LIVINGAREA_MEDI COMMONAREA_MEDI 0.550937
2893 COMMONAREA_MEDI LIVINGAREA_AVG 0.550281

Conclusion :¶

Si dos variables están fuertemente correlacionadas, esto puede indicar multicolinealidad, donde las variables predicen esencialmente la misma información.

Aqui vemos en orden las variables mas correladas entre si, se nota que en la mayoria de los casos son variables de 'tipo de vivienda' que aportan detalles sobre el hogar, lo que tiene sentido porque son variables relacionadas entre si y muy detalladas, que mas o menos describen con detalle la vivienda.

  • Otras variables relacionadas son AMT_ANNUITY y AMT_CREDIT, y AMT_ANNUITY y AMT_GOODS_PRICE que van relacionando por proporcionalidad, porque AMT_ANNUITY es proporcional a AMT_CREDIT, ya que una cuota mensual más alta está típicamente asociada a un monto de crédito más alto.

Se podria, segun del modelo, considerar removar algunas, o juntarles en una nueva variable para reducir el numero de variables y la multicolinealidad. Se puede tambien considerar usar el PCA...

MISSING VALUES¶

In [66]:
eda.get_percent_null_values_target(data_train, continuous_vars, target='TARGET')
Out[66]:
Category_0 variable sum_null_values porcentaje_sum_null_values Category_1
0 1.000000 AMT_ANNUITY 11 0.000045 NaN
1 0.919431 AMT_GOODS_PRICE 211 0.000858 0.080569
2 0.914944 OWN_CAR_AGE 162329 0.659853 0.085056
3 0.914595 EXT_SOURCE_1 138528 0.563104 0.085405
4 0.914729 EXT_SOURCE_2 516 0.002097 0.085271
5 0.907155 EXT_SOURCE_3 48974 0.199075 0.092845
6 0.908756 APARTMENTS_AVG 124764 0.507154 0.091244
7 0.911016 BASEMENTAREA_AVG 143880 0.584859 0.088984
8 0.908192 YEARS_BEGINEXPLUATATION_AVG 119935 0.487525 0.091808
9 0.913332 YEARS_BUILD_AVG 163544 0.664791 0.086668
10 0.914398 COMMONAREA_AVG 171830 0.698473 0.085602
11 0.909222 ELEVATORS_AVG 131001 0.532507 0.090778
12 0.908444 ENTRANCES_AVG 123739 0.502988 0.091556
13 0.908375 FLOORSMAX_AVG 122325 0.497240 0.091625
14 0.913850 FLOORSMIN_AVG 166871 0.678315 0.086150
15 0.911949 LANDAREA_AVG 146029 0.593595 0.088051
16 0.913899 LIVINGAPARTMENTS_AVG 168151 0.683518 0.086101
17 0.908792 LIVINGAREA_AVG 123356 0.501431 0.091208
18 0.914308 NONLIVINGAPARTMENTS_AVG 170798 0.694278 0.085692
19 0.909864 NONLIVINGAREA_AVG 135617 0.551271 0.090136
20 0.908756 APARTMENTS_MODE 124764 0.507154 0.091244
21 0.911016 BASEMENTAREA_MODE 143880 0.584859 0.088984
22 0.908192 YEARS_BEGINEXPLUATATION_MODE 119935 0.487525 0.091808
23 0.913332 YEARS_BUILD_MODE 163544 0.664791 0.086668
24 0.914398 COMMONAREA_MODE 171830 0.698473 0.085602
25 0.909222 ELEVATORS_MODE 131001 0.532507 0.090778
26 0.908444 ENTRANCES_MODE 123739 0.502988 0.091556
27 0.908375 FLOORSMAX_MODE 122325 0.497240 0.091625
28 0.913850 FLOORSMIN_MODE 166871 0.678315 0.086150
29 0.911949 LANDAREA_MODE 146029 0.593595 0.088051
30 0.913899 LIVINGAPARTMENTS_MODE 168151 0.683518 0.086101
31 0.908792 LIVINGAREA_MODE 123356 0.501431 0.091208
32 0.914308 NONLIVINGAPARTMENTS_MODE 170798 0.694278 0.085692
33 0.909864 NONLIVINGAREA_MODE 135617 0.551271 0.090136
34 0.908756 APARTMENTS_MEDI 124764 0.507154 0.091244
35 0.911016 BASEMENTAREA_MEDI 143880 0.584859 0.088984
36 0.908192 YEARS_BEGINEXPLUATATION_MEDI 119935 0.487525 0.091808
37 0.913332 YEARS_BUILD_MEDI 163544 0.664791 0.086668
38 0.914398 COMMONAREA_MEDI 171830 0.698473 0.085602
39 0.909222 ELEVATORS_MEDI 131001 0.532507 0.090778
40 0.908444 ENTRANCES_MEDI 123739 0.502988 0.091556
41 0.908375 FLOORSMAX_MEDI 122325 0.497240 0.091625
42 0.913850 FLOORSMIN_MEDI 166871 0.678315 0.086150
43 0.911949 LANDAREA_MEDI 146029 0.593595 0.088051
44 0.913899 LIVINGAPARTMENTS_MEDI 168151 0.683518 0.086101
45 0.908792 LIVINGAREA_MEDI 123356 0.501431 0.091208
46 0.914308 NONLIVINGAPARTMENTS_MEDI 170798 0.694278 0.085692
47 0.909864 NONLIVINGAREA_MEDI 135617 0.551271 0.090136
48 0.907951 TOTALAREA_MODE 118643 0.482273 0.092049
49 0.896709 AMT_REQ_CREDIT_BUREAU_MON 33391 0.135731 0.103291
50 0.896709 AMT_REQ_CREDIT_BUREAU_YEAR 33391 0.135731 0.103291

Si una variable importante tiene muchos valores faltantes, puede significar que el cliente no entregó un documento necesario, lo que podría ser senal de mayor riesgo. También podría ser que esa información no era obligatoria al solicitar el préstamo. En cualquier caso, pueden dar pistas sobre el comportamiento o riesgo del cliente.

QUE ESTRATREGIA DECIDI SEGUIR ?¶

En algunos campos se debe elegir particularmente¶

Decidimos agrupar variables de vivienda, y variables mas especificas para imputar de dos maneras diferentes:

Utilizamos dos métodos para tratar los valores faltantes según las características de los datos.

  • La imputación con la media la aplico a las variables relacionadas con la vivienda y la descripción de los hogares, ya que estas variables son relativamente homogéneas y independientes de otros factores, lo que hace que este método sea simple y efectivo.

  • Por otro lado, usé el SimpleImputer con la strategy de mediana, para las columnas continuas que quedaban como las de AMT_ANNUITY, OWN_CAR_AGE y EXT_SOURCE_1...

Ya que es una técnica robusta con valores atípicos, que son mas o menos comunes en estas variables. La mediana garantiza una imputación mas rápida y razonable, evitando distorsiones debidas a valores extremos.

Además, permite un flujo consistente y escalable entre conjuntos de entrenamiento y prueba, asegurando que las mismas imputaciones se apliquen de forma uniforme. Si se necesita mayor precisión o se identifican relaciones complejas entre variables, podría implementarse un KNearestNeighbors para predecir mejor valores missing con sus dependencias asociadas a sus 'vecinos'os.

In [68]:
# copio los datos originales
data_train_input = data_train.copy()
data_test_input = data_test.copy()

#alisto mis columnas continuas relacionadas con la vivienda cuales nulos cambiaré con la media:
housing_continues_columns = [
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 
    'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 
    'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 
    'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 
    'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 
    'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 
    'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 
    'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 
    'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 
    'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 
    'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE'
]

#IMPUTANDO CON LA MEDIA EN LOS DOS CONJUNTOS DE MI DATA:
data_train_input[housing_continues_columns] = data_train[housing_continues_columns].fillna(data_train[housing_continues_columns].mean())
data_test_input[housing_continues_columns] = data_test[housing_continues_columns].fillna(data_test[housing_continues_columns].mean())

#Otras columnas que pondré con una imputacion con mediana:
remaining_continues_columns = [
    'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'OWN_CAR_AGE', 'EXT_SOURCE_1', 
    'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR'
]

#escalado de los datos (bloques para mas eficiencia)
scaler = StandardScaler()

def scale_and_impute(data, columns, scaler, imputer):
    scaled_data = scaler.fit_transform(data[columns]) 
    return imputer.fit_transform(scaled_data)          

#USO EL METODO DE LA MEDIANA:
simple_imputer = SimpleImputer(strategy="median")

#imputar con escalado para mas facilidad
data_train_imputed = scale_and_impute(data_train_input, remaining_continues_columns, scaler, simple_imputer)
data_test_imputed = scale_and_impute(data_test_input, remaining_continues_columns, scaler, simple_imputer)

#dataframe completos
data_train_imputed_df = pd.DataFrame(data_train_imputed, columns=remaining_continues_columns, index=data_train_input.index)
data_test_imputed_df = pd.DataFrame(data_test_imputed, columns=remaining_continues_columns, index=data_test_input.index)

#asignar nuevos datos al dataframe que he copiado en primer lugar
data_train_input[remaining_continues_columns] = data_train_imputed_df
data_test_input[remaining_continues_columns] = data_test_imputed_df

#COMPROBACION

print("\nValores nulos después de la imputación en train (remaining_continues_columns):")
print(data_train_input[remaining_continues_columns].isnull().sum())

print("\nValores nulos después de la imputación en test (remaining_continues_columns):")
print(data_test_input[remaining_continues_columns].isnull().sum())
Valores nulos después de la imputación en train (remaining_continues_columns):
AMT_ANNUITY                   0
AMT_GOODS_PRICE               0
OWN_CAR_AGE                   0
EXT_SOURCE_1                  0
EXT_SOURCE_2                  0
EXT_SOURCE_3                  0
AMT_REQ_CREDIT_BUREAU_MON     0
AMT_REQ_CREDIT_BUREAU_YEAR    0
dtype: int64

Valores nulos después de la imputación en test (remaining_continues_columns):
AMT_ANNUITY                   0
AMT_GOODS_PRICE               0
OWN_CAR_AGE                   0
EXT_SOURCE_1                  0
EXT_SOURCE_2                  0
EXT_SOURCE_3                  0
AMT_REQ_CREDIT_BUREAU_MON     0
AMT_REQ_CREDIT_BUREAU_YEAR    0
dtype: int64

Variables categoricas¶

In [70]:
print("Cantidad de valores nulos en 'TARGET':", data_train['TARGET'].isnull().sum())
Cantidad de valores nulos en 'TARGET': 0
In [71]:
categorical_vars
Out[71]:
['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'OCCUPATION_TYPE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'TARGET']
In [72]:
#conteos antes de tratar los datos categoricos :

print("missing de categoricas antes de transformar:")
print(data_train_input[categorical_vars].isnull().sum())


print("\nTipo de cada variable:")
print(data_train_input[categorical_vars].dtypes)


print("\nValores unicos de cada variable:")
print(data_train_input[categorical_vars].nunique())
missing de categoricas antes de transformar:
NAME_CONTRACT_TYPE                  0
CODE_GENDER                         3
FLAG_OWN_CAR                        0
FLAG_OWN_REALTY                     0
CNT_CHILDREN                        0
NAME_TYPE_SUITE                  1003
NAME_INCOME_TYPE                    0
NAME_EDUCATION_TYPE                 0
NAME_FAMILY_STATUS                  0
NAME_HOUSING_TYPE                   0
FLAG_MOBIL                          0
FLAG_EMP_PHONE                      0
FLAG_WORK_PHONE                     0
FLAG_CONT_MOBILE                    0
FLAG_PHONE                          0
FLAG_EMAIL                          0
OCCUPATION_TYPE                 76942
CNT_FAM_MEMBERS                     2
REGION_RATING_CLIENT                0
REGION_RATING_CLIENT_W_CITY         0
WEEKDAY_APPR_PROCESS_START          0
REG_REGION_NOT_LIVE_REGION          0
REG_REGION_NOT_WORK_REGION          0
LIVE_REGION_NOT_WORK_REGION         0
REG_CITY_NOT_LIVE_CITY              0
REG_CITY_NOT_WORK_CITY              0
LIVE_CITY_NOT_WORK_CITY             0
ORGANIZATION_TYPE                   0
FONDKAPREMONT_MODE             168191
HOUSETYPE_MODE                 123367
WALLSMATERIAL_MODE             125005
EMERGENCYSTATE_MODE            116513
FLAG_DOCUMENT_2                     0
FLAG_DOCUMENT_3                     0
FLAG_DOCUMENT_4                     0
FLAG_DOCUMENT_5                     0
FLAG_DOCUMENT_6                     0
FLAG_DOCUMENT_7                     0
FLAG_DOCUMENT_8                     0
FLAG_DOCUMENT_9                     0
FLAG_DOCUMENT_10                    0
FLAG_DOCUMENT_11                    0
FLAG_DOCUMENT_12                    0
FLAG_DOCUMENT_13                    0
FLAG_DOCUMENT_14                    0
FLAG_DOCUMENT_15                    0
FLAG_DOCUMENT_16                    0
FLAG_DOCUMENT_17                    0
FLAG_DOCUMENT_18                    0
FLAG_DOCUMENT_19                    0
FLAG_DOCUMENT_20                    0
FLAG_DOCUMENT_21                    0
AMT_REQ_CREDIT_BUREAU_HOUR      33391
AMT_REQ_CREDIT_BUREAU_DAY       33391
AMT_REQ_CREDIT_BUREAU_WEEK      33391
AMT_REQ_CREDIT_BUREAU_QRT       33391
TARGET                              0
dtype: int64

Tipo de cada variable:
NAME_CONTRACT_TYPE              object
CODE_GENDER                     object
FLAG_OWN_CAR                    object
FLAG_OWN_REALTY                 object
CNT_CHILDREN                     int64
NAME_TYPE_SUITE                 object
NAME_INCOME_TYPE                object
NAME_EDUCATION_TYPE             object
NAME_FAMILY_STATUS              object
NAME_HOUSING_TYPE               object
FLAG_MOBIL                       int64
FLAG_EMP_PHONE                   int64
FLAG_WORK_PHONE                  int64
FLAG_CONT_MOBILE                 int64
FLAG_PHONE                       int64
FLAG_EMAIL                       int64
OCCUPATION_TYPE                 object
CNT_FAM_MEMBERS                float64
REGION_RATING_CLIENT             int64
REGION_RATING_CLIENT_W_CITY      int64
WEEKDAY_APPR_PROCESS_START      object
REG_REGION_NOT_LIVE_REGION       int64
REG_REGION_NOT_WORK_REGION       int64
LIVE_REGION_NOT_WORK_REGION      int64
REG_CITY_NOT_LIVE_CITY           int64
REG_CITY_NOT_WORK_CITY           int64
LIVE_CITY_NOT_WORK_CITY          int64
ORGANIZATION_TYPE               object
FONDKAPREMONT_MODE              object
HOUSETYPE_MODE                  object
WALLSMATERIAL_MODE              object
EMERGENCYSTATE_MODE             object
FLAG_DOCUMENT_2                  int64
FLAG_DOCUMENT_3                  int64
FLAG_DOCUMENT_4                  int64
FLAG_DOCUMENT_5                  int64
FLAG_DOCUMENT_6                  int64
FLAG_DOCUMENT_7                  int64
FLAG_DOCUMENT_8                  int64
FLAG_DOCUMENT_9                  int64
FLAG_DOCUMENT_10                 int64
FLAG_DOCUMENT_11                 int64
FLAG_DOCUMENT_12                 int64
FLAG_DOCUMENT_13                 int64
FLAG_DOCUMENT_14                 int64
FLAG_DOCUMENT_15                 int64
FLAG_DOCUMENT_16                 int64
FLAG_DOCUMENT_17                 int64
FLAG_DOCUMENT_18                 int64
FLAG_DOCUMENT_19                 int64
FLAG_DOCUMENT_20                 int64
FLAG_DOCUMENT_21                 int64
AMT_REQ_CREDIT_BUREAU_HOUR     float64
AMT_REQ_CREDIT_BUREAU_DAY      float64
AMT_REQ_CREDIT_BUREAU_WEEK     float64
AMT_REQ_CREDIT_BUREAU_QRT      float64
TARGET                           int64
dtype: object

Valores unicos de cada variable:
NAME_CONTRACT_TYPE              2
CODE_GENDER                     2
FLAG_OWN_CAR                    2
FLAG_OWN_REALTY                 2
CNT_CHILDREN                   15
NAME_TYPE_SUITE                 7
NAME_INCOME_TYPE                8
NAME_EDUCATION_TYPE             5
NAME_FAMILY_STATUS              6
NAME_HOUSING_TYPE               6
FLAG_MOBIL                      1
FLAG_EMP_PHONE                  2
FLAG_WORK_PHONE                 2
FLAG_CONT_MOBILE                2
FLAG_PHONE                      2
FLAG_EMAIL                      2
OCCUPATION_TYPE                18
CNT_FAM_MEMBERS                17
REGION_RATING_CLIENT            3
REGION_RATING_CLIENT_W_CITY     3
WEEKDAY_APPR_PROCESS_START      7
REG_REGION_NOT_LIVE_REGION      2
REG_REGION_NOT_WORK_REGION      2
LIVE_REGION_NOT_WORK_REGION     2
REG_CITY_NOT_LIVE_CITY          2
REG_CITY_NOT_WORK_CITY          2
LIVE_CITY_NOT_WORK_CITY         2
ORGANIZATION_TYPE              58
FONDKAPREMONT_MODE              4
HOUSETYPE_MODE                  3
WALLSMATERIAL_MODE              7
EMERGENCYSTATE_MODE             2
FLAG_DOCUMENT_2                 2
FLAG_DOCUMENT_3                 2
FLAG_DOCUMENT_4                 2
FLAG_DOCUMENT_5                 2
FLAG_DOCUMENT_6                 2
FLAG_DOCUMENT_7                 2
FLAG_DOCUMENT_8                 2
FLAG_DOCUMENT_9                 2
FLAG_DOCUMENT_10                2
FLAG_DOCUMENT_11                2
FLAG_DOCUMENT_12                2
FLAG_DOCUMENT_13                2
FLAG_DOCUMENT_14                2
FLAG_DOCUMENT_15                2
FLAG_DOCUMENT_16                2
FLAG_DOCUMENT_17                2
FLAG_DOCUMENT_18                2
FLAG_DOCUMENT_19                2
FLAG_DOCUMENT_20                2
FLAG_DOCUMENT_21                2
AMT_REQ_CREDIT_BUREAU_HOUR      5
AMT_REQ_CREDIT_BUREAU_DAY       9
AMT_REQ_CREDIT_BUREAU_WEEK      9
AMT_REQ_CREDIT_BUREAU_QRT      10
TARGET                          2
dtype: int64
In [73]:
#bucle para tener las correlaciones categoricas de cada variable categorica respeto a la variable objetivo:
for var in categorical_vars:
    print('------------------------------------------------------------')
    print(f"Confusion matrix parfa {var} con respeto a TARGET:")
    confusion_matrix = pd.crosstab(data_train['TARGET'], data_train_input[var])
    print(confusion_matrix)
    #coeficiente de Cramers v:
    cramers_v_value = eda.cramers_v(confusion_matrix.values)
    print(f"Cramér's V for {var}: {cramers_v_value}\n")
------------------------------------------------------------
Confusion matrix parfa NAME_CONTRACT_TYPE con respeto a TARGET:
NAME_CONTRACT_TYPE  CASH LOANS  REVOLVING LOANS
TARGET                                         
0                       204106            22042
1                        18590             1270
Cramér's V for NAME_CONTRACT_TYPE: 0.0310865205023682

------------------------------------------------------------
Confusion matrix parfa CODE_GENDER con respeto a TARGET:
CODE_GENDER  FEMALE   MALE
TARGET                    
0            150573  75572
1             11307   8553
Cramér's V for CODE_GENDER: 0.05535981073628126

------------------------------------------------------------
Confusion matrix parfa FLAG_OWN_CAR con respeto a TARGET:
FLAG_OWN_CAR      NO    YES
TARGET                     
0             148517  77631
1              13807   6053
Cramér's V for FLAG_OWN_CAR: 0.022025230924775974

------------------------------------------------------------
Confusion matrix parfa FLAG_OWN_REALTY con respeto a TARGET:
FLAG_OWN_REALTY     NO     YES
TARGET                        
0                69132  157016
1                 6341   13519
Cramér's V for FLAG_OWN_REALTY: 0.007754761693103105

------------------------------------------------------------
Confusion matrix parfa CNT_CHILDREN con respeto a TARGET:
CNT_CHILDREN      0      1      2     3    4   5   6   7   8   9   10  11  12  \
TARGET                                                                          
0             159018  44541  19528  2674  302  60  11   5   2   0   1   0   2   
1              13325   4327   1861   287   46   6   5   0   0   2   0   1   0   

CNT_CHILDREN  14  19  
TARGET                
0              2   2  
1              0   0  
Cramér's V for CNT_CHILDREN: 0.023388575925143276

------------------------------------------------------------
Confusion matrix parfa NAME_TYPE_SUITE con respeto a TARGET:
NAME_TYPE_SUITE  CHILDREN  FAMILY  GROUP OF PEOPLE  OTHER_A  OTHER_B  \
TARGET                                                                 
0                    2468   29724              200      637     1266   
1                     193    2399               17       62      140   

NAME_TYPE_SUITE  SPOUSE, PARTNER  UNACCOMPANIED  
TARGET                                           
0                           8286         182618  
1                            713          16282  
Cramér's V for NAME_TYPE_SUITE: 0.009732772442277339

------------------------------------------------------------
Confusion matrix parfa NAME_INCOME_TYPE con respeto a TARGET:
NAME_INCOME_TYPE  BUSINESSMAN  COMMERCIAL ASSOCIATE  MATERNITY LEAVE  \
TARGET                                                                 
0                          10                 53077                3   
1                           0                  4324                1   

NAME_INCOME_TYPE  PENSIONER  STATE SERVANT  STUDENT  UNEMPLOYED  WORKING  
TARGET                                                                    
0                     41765          16381       13          13   114886  
1                      2367           1025        0           5    12138  
Cramér's V for NAME_INCOME_TYPE: 0.062250216935764706

------------------------------------------------------------
Confusion matrix parfa NAME_EDUCATION_TYPE con respeto a TARGET:
NAME_EDUCATION_TYPE  ACADEMIC DEGREE  HIGHER EDUCATION  INCOMPLETE HIGHER  \
TARGET                                                                      
0                                127             56774               7539   
1                                  2              3192                695   

NAME_EDUCATION_TYPE  LOWER SECONDARY  SECONDARY / SECONDARY SPECIAL  
TARGET                                                               
0                               2665                         159043  
1                                338                          15633  
Cramér's V for NAME_EDUCATION_TYPE: 0.058359656764498065

------------------------------------------------------------
Confusion matrix parfa NAME_FAMILY_STATUS con respeto a TARGET:
NAME_FAMILY_STATUS  CIVIL MARRIAGE  MARRIED  SEPARATED  SINGLE / NOT MARRIED  \
TARGET                                                                         
0                            21364   145299      14534                 32785   
1                             2375    11842       1329                  3557   

NAME_FAMILY_STATUS  UNKNOWN  WIDOW  
TARGET                              
0                         2  12164  
1                         0    757  
Cramér's V for NAME_FAMILY_STATUS: 0.04067121943031738

------------------------------------------------------------
Confusion matrix parfa NAME_HOUSING_TYPE con respeto a TARGET:
NAME_HOUSING_TYPE  CO-OP APARTMENT  HOUSE / APARTMENT  MUNICIPAL APARTMENT  \
TARGET                                                                       
0                              834             201252                 8228   
1                               72              17008                  753   

NAME_HOUSING_TYPE  OFFICE APARTMENT  RENTED APARTMENT  WITH PARENTS  
TARGET                                                               
0                              1950              3460         10424  
1                               137               484          1406  
Cramér's V for NAME_HOUSING_TYPE: 0.03777646043528446

------------------------------------------------------------
Confusion matrix parfa FLAG_MOBIL con respeto a TARGET:
FLAG_MOBIL       1
TARGET            
0           226148
1            19860
Cramér's V for FLAG_MOBIL: nan

------------------------------------------------------------
Confusion matrix parfa FLAG_EMP_PHONE con respeto a TARGET:
FLAG_EMP_PHONE      0       1
TARGET                       
0               41778  184370
1                2373   17487
Cramér's V for FLAG_EMP_PHONE: 0.04625807891255162

------------------------------------------------------------
Confusion matrix parfa FLAG_WORK_PHONE con respeto a TARGET:
C:\Users\ninao\Desktop\ML_practica1EDA\notebooks\../src\eda_utils.py:549: RuntimeWarning: invalid value encountered in double_scalars
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
FLAG_WORK_PHONE       0      1
TARGET                        
0                181826  44322
1                 15095   4765
Cramér's V for FLAG_WORK_PHONE: 0.02986670896835815

------------------------------------------------------------
Confusion matrix parfa FLAG_CONT_MOBILE con respeto a TARGET:
FLAG_CONT_MOBILE    0       1
TARGET                       
0                 427  225721
1                  36   19824
Cramér's V for FLAG_CONT_MOBILE: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_PHONE con respeto a TARGET:
FLAG_PHONE       0      1
TARGET                   
0           161863  64285
1            14939   4921
Cramér's V for FLAG_PHONE: 0.021990638240084764

------------------------------------------------------------
Confusion matrix parfa FLAG_EMAIL con respeto a TARGET:
FLAG_EMAIL       0      1
TARGET                   
0           213165  12983
1            18773   1087
Cramér's V for FLAG_EMAIL: 0.0023645650771151965

------------------------------------------------------------
Confusion matrix parfa OCCUPATION_TYPE con respeto a TARGET:
OCCUPATION_TYPE  ACCOUNTANTS  CLEANING STAFF  COOKING STAFF  CORE STAFF  \
TARGET                                                                    
0                       7514            3362           4276       20671   
1                        379             372            507        1410   

OCCUPATION_TYPE  DRIVERS  HIGH SKILL TECH STAFF  HR STAFF  IT STAFF  LABORERS  \
TARGET                                                                          
0                  13195                   8584       436       389     39540   
1                   1699                    560        28        32      4652   

OCCUPATION_TYPE  LOW-SKILL LABORERS  MANAGERS  MEDICINE STAFF  \
TARGET                                                          
0                              1383     16048            6342   
1                               296      1092             454   

OCCUPATION_TYPE  PRIVATE SERVICE STAFF  REALTY AGENTS  SALES STAFF  \
TARGET                                                               
0                                 1992            549        23201   
1                                  136             44         2446   

OCCUPATION_TYPE  SECRETARIES  SECURITY STAFF  WAITERS/BARMEN STAFF  
TARGET                                                              
0                        993            4751                   959  
1                         74             585                   115  
Cramér's V for OCCUPATION_TYPE: 0.08084491419477519

------------------------------------------------------------
Confusion matrix parfa CNT_FAM_MEMBERS con respeto a TARGET:
CNT_FAM_MEMBERS   1.0     2.0    3.0    4.0   5.0   6.0   7.0   8.0   9.0   \
TARGET                                                                       
0                49807  117020  38380  18067  2503   288    57    10     5   
1                 4573    9611   3652   1702   262    47     5     5     0   

CNT_FAM_MEMBERS  10.0  11.0  12.0  13.0  14.0  15.0  16.0  20.0  
TARGET                                                           
0                   2     0     1     0     2     1     1     2  
1                   1     1     0     1     0     0     0     0  
Cramér's V for CNT_FAM_MEMBERS: 0.022443597931890227

------------------------------------------------------------
Confusion matrix parfa REGION_RATING_CLIENT con respeto a TARGET:
REGION_RATING_CLIENT      1       2      3
TARGET                                    
0                     24430  167304  34414
1                      1262   14307   4291
Cramér's V for REGION_RATING_CLIENT: 0.05797431452905184

------------------------------------------------------------
Confusion matrix parfa REGION_RATING_CLIENT_W_CITY con respeto a TARGET:
REGION_RATING_CLIENT_W_CITY      1       2      3
TARGET                                           
0                            25941  169054  31153
1                             1349   14489   4022
Cramér's V for REGION_RATING_CLIENT_W_CITY: 0.060540064376819976

------------------------------------------------------------
Confusion matrix parfa WEEKDAY_APPR_PROCESS_START con respeto a TARGET:
WEEKDAY_APPR_PROCESS_START  FRIDAY  MONDAY  SATURDAY  SUNDAY  THURSDAY  \
TARGET                                                                   
0                            36887   37472     24962   11869     37218   
1                             3281    3152      2141    1022      3305   

WEEKDAY_APPR_PROCESS_START  TUESDAY  WEDNESDAY  
TARGET                                          
0                             39534      38206  
1                              3580       3379  
Cramér's V for WEEKDAY_APPR_PROCESS_START: 0.004495454281500501

------------------------------------------------------------
Confusion matrix parfa REG_REGION_NOT_LIVE_REGION con respeto a TARGET:
REG_REGION_NOT_LIVE_REGION       0     1
TARGET                                  
0                           222748  3400
1                            19502   358
Cramér's V for REG_REGION_NOT_LIVE_REGION: 0.0062680653140641555

------------------------------------------------------------
Confusion matrix parfa REG_REGION_NOT_WORK_REGION con respeto a TARGET:
REG_REGION_NOT_WORK_REGION       0      1
TARGET                                   
0                           214743  11405
1                            18745   1115
Cramér's V for REG_REGION_NOT_WORK_REGION: 0.006750820541451695

------------------------------------------------------------
Confusion matrix parfa LIVE_REGION_NOT_WORK_REGION con respeto a TARGET:
LIVE_REGION_NOT_WORK_REGION       0     1
TARGET                                   
0                            216953  9195
1                             19015   845
Cramér's V for LIVE_REGION_NOT_WORK_REGION: 0.0015818927525979585

------------------------------------------------------------
Confusion matrix parfa REG_CITY_NOT_LIVE_CITY con respeto a TARGET:
REG_CITY_NOT_LIVE_CITY       0      1
TARGET                               
0                       209170  16978
1                        17481   2379
Cramér's V for REG_CITY_NOT_LIVE_CITY: 0.04516813958686674

------------------------------------------------------------
Confusion matrix parfa REG_CITY_NOT_WORK_CITY con respeto a TARGET:
REG_CITY_NOT_WORK_CITY       0      1
TARGET                               
0                       175412  50736
1                        13878   5982
Cramér's V for REG_CITY_NOT_WORK_CITY: 0.04965344090426648

------------------------------------------------------------
Confusion matrix parfa LIVE_CITY_NOT_WORK_CITY con respeto a TARGET:
LIVE_CITY_NOT_WORK_CITY       0      1
TARGET                                
0                        186403  39745
1                         15493   4367
Cramér's V for LIVE_CITY_NOT_WORK_CITY: 0.03126228732573633

------------------------------------------------------------
Confusion matrix parfa ORGANIZATION_TYPE con respeto a TARGET:
ORGANIZATION_TYPE  ADVERTISING  AGRICULTURE  BANK  BUSINESS ENTITY TYPE 1  \
TARGET                                                                      
0                          316         1761  1858                    4360   
1                           29          207   102                     394   

ORGANIZATION_TYPE  BUSINESS ENTITY TYPE 2  BUSINESS ENTITY TYPE 3  CLEANING  \
TARGET                                                                        
0                                    7733                   49490       193   
1                                     723                    5107        22   

ORGANIZATION_TYPE  CONSTRUCTION  CULTURE  ELECTRICITY  EMERGENCY  GOVERNMENT  \
TARGET                                                                         
0                          4787      291          716        418        7661   
1                           632       15           50         31         567   

ORGANIZATION_TYPE  HOTEL  HOUSING  INDUSTRY: TYPE 1  INDUSTRY: TYPE 10  \
TARGET                                                                   
0                    738     2196               731                 84   
1                     48      185                92                  3   

ORGANIZATION_TYPE  INDUSTRY: TYPE 11  INDUSTRY: TYPE 12  INDUSTRY: TYPE 13  \
TARGET                                                                       
0                               1959                291                 45   
1                                195                 12                  7   

ORGANIZATION_TYPE  INDUSTRY: TYPE 2  INDUSTRY: TYPE 3  INDUSTRY: TYPE 4  \
TARGET                                                                    
0                               338              2341               631   
1                                21               262                74   

ORGANIZATION_TYPE  INDUSTRY: TYPE 5  INDUSTRY: TYPE 6  INDUSTRY: TYPE 7  \
TARGET                                                                    
0                               457                81               972   
1                                32                 6                80   

ORGANIZATION_TYPE  INDUSTRY: TYPE 8  INDUSTRY: TYPE 9  INSURANCE  \
TARGET                                                             
0                                16              2535        454   
1                                 2               185         25   

ORGANIZATION_TYPE  KINDERGARTEN  LEGAL SERVICES  MEDICINE  MILITARY  MOBILE  \
TARGET                                                                        
0                          5157             229      8385      1974     228   
1                           383              22       590       110      22   

ORGANIZATION_TYPE  OTHER  POLICE  POSTAL  REALTOR  RELIGION  RESTAURANT  \
TARGET                                                                    
0                  12335    1794    1568      285        61        1295   
1                   1030      99     141       36         3         166   

ORGANIZATION_TYPE  SCHOOL  SECURITY  SECURITY MINISTRIES  SELF-EMPLOYED  \
TARGET                                                                    
0                    6684      2305                 1494          27621   
1                     421       273                   68           3105   

ORGANIZATION_TYPE  SERVICES  TELECOM  TRADE: TYPE 1  TRADE: TYPE 2  \
TARGET                                                               
0                      1163      421            243           1419   
1                        80       37             27            104   

ORGANIZATION_TYPE  TRADE: TYPE 3  TRADE: TYPE 4  TRADE: TYPE 5  TRADE: TYPE 6  \
TARGET                                                                          
0                           2561             52             38            464   
1                            295              0              3             19   

ORGANIZATION_TYPE  TRADE: TYPE 7  TRANSPORT: TYPE 1  TRANSPORT: TYPE 2  \
TARGET                                                                   
0                           5670                157               1619   
1                            583                  7                140   

ORGANIZATION_TYPE  TRANSPORT: TYPE 3  TRANSPORT: TYPE 4  UNIVERSITY    XNA  
TARGET                                                                      
0                                800               3903        1001  41769  
1                                154                403          59   2372  
Cramér's V for ORGANIZATION_TYPE: 0.07154282732719068

------------------------------------------------------------
Confusion matrix parfa FONDKAPREMONT_MODE con respeto a TARGET:
FONDKAPREMONT_MODE  NOT SPECIFIED  ORG SPEC ACCOUNT  REG OPER ACCOUNT  \
TARGET                                                                  
0                            4183              4231             54982   
1                             348               264              4146   

FONDKAPREMONT_MODE  REG OPER SPEC ACCOUNT  
TARGET                                     
0                                    9040  
1                                     623  
Cramér's V for FONDKAPREMONT_MODE: 0.012850572159249297

------------------------------------------------------------
Confusion matrix parfa HOUSETYPE_MODE con respeto a TARGET:
HOUSETYPE_MODE  BLOCK OF FLATS  SPECIFIC HOUSING  TERRACED HOUSE
TARGET                                                          
0                       112116              1056             869
1                         8389               125              86
Cramér's V for HOUSETYPE_MODE: 0.014947668199276329

------------------------------------------------------------
Confusion matrix parfa WALLSMATERIAL_MODE con respeto a TARGET:
WALLSMATERIAL_MODE  BLOCK  MIXED  MONOLITHIC  OTHERS  PANEL  STONE, BRICK  \
TARGET                                                                      
0                    6902   1659        1355    1188  49517         48030   
1                     526    140          65     111   3368          3849   

WALLSMATERIAL_MODE  WOODEN  
TARGET                      
0                     3875  
1                      418  
Cramér's V for WALLSMATERIAL_MODE: 0.03001729551410803

------------------------------------------------------------
Confusion matrix parfa EMERGENCYSTATE_MODE con respeto a TARGET:
EMERGENCYSTATE_MODE      NO   YES
TARGET                           
0                    118703  1699
1                      8913   180
Cramér's V for EMERGENCYSTATE_MODE: 0.01169416294243119

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_2 con respeto a TARGET:
FLAG_DOCUMENT_2       0  1
TARGET                    
0                226139  9
1                 19857  3
Cramér's V for FLAG_DOCUMENT_2: 0.0025764726142198224

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_3 con respeto a TARGET:
FLAG_DOCUMENT_3      0       1
TARGET                        
0                66692  159456
1                 4371   15489
Cramér's V for FLAG_DOCUMENT_3: 0.0449053947444949

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_4 con respeto a TARGET:
FLAG_DOCUMENT_4       0   1
TARGET                     
0                226128  20
1                 19860   0
Cramér's V for FLAG_DOCUMENT_4: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_5 con respeto a TARGET:
FLAG_DOCUMENT_5       0     1
TARGET                       
0                222737  3411
1                 19568   292
Cramér's V for FLAG_DOCUMENT_5: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_6 con respeto a TARGET:
FLAG_DOCUMENT_6       0      1
TARGET                        
0                205844  20304
1                 18673   1187
Cramér's V for FLAG_DOCUMENT_6: 0.028860072636401262

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_7 con respeto a TARGET:
FLAG_DOCUMENT_7       0   1
TARGET                     
0                226106  42
1                 19858   2
Cramér's V for FLAG_DOCUMENT_7: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_8 con respeto a TARGET:
FLAG_DOCUMENT_8       0      1
TARGET                        
0                207586  18562
1                 18396   1464
Cramér's V for FLAG_DOCUMENT_8: 0.008055564379763382

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_9 con respeto a TARGET:
FLAG_DOCUMENT_9       0    1
TARGET                      
0                225254  894
1                 19801   59
Cramér's V for FLAG_DOCUMENT_9: 0.003670711674578854

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_10 con respeto a TARGET:
FLAG_DOCUMENT_10       0  1
TARGET                     
0                 226141  7
1                  19860  0
Cramér's V for FLAG_DOCUMENT_10: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_11 con respeto a TARGET:
FLAG_DOCUMENT_11       0    1
TARGET                       
0                 225250  898
1                  19800   60
Cramér's V for FLAG_DOCUMENT_11: 0.003494245356856063

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_12 con respeto a TARGET:
FLAG_DOCUMENT_12       0  1
TARGET                     
0                 226146  2
1                  19860  0
Cramér's V for FLAG_DOCUMENT_12: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_13 con respeto a TARGET:
FLAG_DOCUMENT_13       0    1
TARGET                       
0                 225280  868
1                  19832   28
Cramér's V for FLAG_DOCUMENT_13: 0.010668706807718871

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_14 con respeto a TARGET:
FLAG_DOCUMENT_14       0    1
TARGET                       
0                 225463  685
1                  19836   24
Cramér's V for FLAG_DOCUMENT_14: 0.008886516785392559

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_15 con respeto a TARGET:
FLAG_DOCUMENT_15       0    1
TARGET                       
0                 225858  290
1                  19851    9
Cramér's V for FLAG_DOCUMENT_15: 0.0059359632723035675

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_16 con respeto a TARGET:
FLAG_DOCUMENT_16       0     1
TARGET                        
0                 223848  2300
1                  19743   117
Cramér's V for FLAG_DOCUMENT_16: 0.01156868111186073

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_17 con respeto a TARGET:
FLAG_DOCUMENT_17       0   1
TARGET                      
0                 226086  62
1                  19858   2
Cramér's V for FLAG_DOCUMENT_17: 0.0014222099477547469

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_18 con respeto a TARGET:
FLAG_DOCUMENT_18       0     1
TARGET                        
0                 224229  1919
1                  19746   114
Cramér's V for FLAG_DOCUMENT_18: 0.007926583281735216

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_19 con respeto a TARGET:
FLAG_DOCUMENT_19       0    1
TARGET                       
0                 226002  146
1                  19850   10
Cramér's V for FLAG_DOCUMENT_19: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_20 con respeto a TARGET:
FLAG_DOCUMENT_20       0    1
TARGET                       
0                 226032  116
1                  19848   12
Cramér's V for FLAG_DOCUMENT_20: 0.0

------------------------------------------------------------
Confusion matrix parfa FLAG_DOCUMENT_21 con respeto a TARGET:
FLAG_DOCUMENT_21       0   1
TARGET                      
0                 226079  69
1                  19849  11
Cramér's V for FLAG_DOCUMENT_21: 0.0026689070623304367

------------------------------------------------------------
Confusion matrix parfa AMT_REQ_CREDIT_BUREAU_HOUR con respeto a TARGET:
AMT_REQ_CREDIT_BUREAU_HOUR     0.0   1.0  2.0  3.0  4.0
TARGET                                                 
0                           195041  1116   42    6    1
1                            16304   101    6    0    0
Cramér's V for AMT_REQ_CREDIT_BUREAU_HOUR: 0.0

------------------------------------------------------------
Confusion matrix parfa AMT_REQ_CREDIT_BUREAU_DAY con respeto a TARGET:
AMT_REQ_CREDIT_BUREAU_DAY     0.0  1.0  2.0  3.0  4.0  5.0  6.0  8.0  9.0
TARGET                                                                   
0                          195139  919   76   35   19    8    7    1    2
1                           16300   97   11    1    2    0    0    0    0
Cramér's V for AMT_REQ_CREDIT_BUREAU_DAY: 0.0034934942279904696

------------------------------------------------------------
Confusion matrix parfa AMT_REQ_CREDIT_BUREAU_WEEK con respeto a TARGET:
AMT_REQ_CREDIT_BUREAU_WEEK     0.0   1.0  2.0  3.0  4.0  5.0  6.0  7.0  8.0
TARGET                                                                     
0                           189909  6052  143   43   28    8   16    2    5
1                            15877   510   16    3    3    1    1    0    0
Cramér's V for AMT_REQ_CREDIT_BUREAU_WEEK: 0.0

------------------------------------------------------------
Confusion matrix parfa AMT_REQ_CREDIT_BUREAU_QRT con respeto a TARGET:
AMT_REQ_CREDIT_BUREAU_QRT    0.0    1.0    2.0   3.0   4.0   5.0   6.0   7.0   \
TARGET                                                                          
0                          158758  25235  10519  1279   345    41    18     5   
1                           13538   1768    952    95    46     3     7     1   

AMT_REQ_CREDIT_BUREAU_QRT  8.0   19.0  
TARGET                                 
0                             6     0  
1                             0     1  
Cramér's V for AMT_REQ_CREDIT_BUREAU_QRT: 0.020681870368418964

------------------------------------------------------------
Confusion matrix parfa TARGET con respeto a TARGET:
TARGET       0      1
TARGET               
0       226148      0
1            0  19860
Cramér's V for TARGET: 0.9999726127135284

En resumen, aunque algunas variables muestran des diferencias en las matrices de confusión, su capacidad para predecir el incumplimiento de pago parece limitada debido a los bajos valores del coeficiente de Cramer.

Las variables más discriminantes entre las clases de TARGET son FLAG_MOBIL, FLAG_EMP_PHONE, FLAG_WORK_PHONE, y variables regionales como REG_REGION_NOT_LIVE_REGION y REG_CITY_NOT_LIVE_CITY. Estas variables presentan grandes diferencias en sus distribuciones y podrían ser indicadores importantes para la predicción de TARGET.

El valor de Cramér's V mide la fuerza de la asociación entre dos variables categóricas, y varía entre 0 y 1. Algunas de las valores importantes obtenidos son

  • ORGANIZATION_TYPE: 0.0715, una asociación bastante moderada (si comparamos con las otras variables) con la variable TARGET.
  • Otras variables (como FLAG_DOCUMENT_2, FLAG_DOCUMENT_10, etc.) tienen valores de Cramér's V cercanos a cero, lo que indica una asociación muy débil o nula.

VALORES NULOS DE CATEGORICAS A TRATAR /¶

  • Decidi coger las variables que a primer vista se pueden rellenar con la moda (el valor mas frecuente); porque a lo mejor son variables que son indispensables para tener un modelo preciso.
  • Los documentos si son faltantes significa que probablemente no han sido entregados asi que decidi rellenar con 0s.
  • Y agrupar las que tienen sentido estar rellenadas con 'sin valor' o 'desconocido'
In [77]:
#mis de columnas categóricas
categorical_vars = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 
    'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 
    'EMERGENCYSTATE_MODE', 'ORGANIZATION_TYPE', 'FLAG_DOCUMENT_2', 
    'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 
    'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 
    'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 
    'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 
    'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT'
]

#les divido segun la imputacion que decidi seguir:
fill_with_most_frequent = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
    'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'OCCUPATION_TYPE'
]

fill_with_unknown = [
    'NAME_TYPE_SUITE', 'NAME_HOUSING_TYPE', 'FONDKAPREMONT_MODE', 
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT'
]

fill_with_zero = [
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 
    'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 
    'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 
    'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'
]

#imputar 
#moda
imputer = SimpleImputer(strategy='most_frequent')

data_train_input[fill_with_most_frequent] = imputer.fit_transform(data_train_input[fill_with_most_frequent])
data_test_input[fill_with_most_frequent] = imputer.transform(data_test_input[fill_with_most_frequent])

#unknown
for col in fill_with_unknown:
    if col in data_train_input.columns:
        # Verificar si "Desconocido" ya está como categoría antes de añadirla
        if data_train_input[col].dtype.name == "category" and "Desconocido" not in data_train_input[col].cat.categories:
            data_train_input[col] = data_train_input[col].cat.add_categories(["Desconocido"])
        if data_test_input[col].dtype.name == "category" and "Desconocido" not in data_test_input[col].cat.categories:
            data_test_input[col] = data_test_input[col].cat.add_categories(["Desconocido"])

data_train_input[fill_with_unknown] = data_train_input[fill_with_unknown].fillna("Desconocido")
data_test_input[fill_with_unknown] = data_test_input[fill_with_unknown].fillna("Desconocido")

#0s
data_train_input[fill_with_zero] = data_train_input[fill_with_zero].fillna(0)
data_test_input[fill_with_zero] = data_test_input[fill_with_zero].fillna(0)

# Verificación
print("Valores faltantes después de la imputación en data_train_input:")
print(data_train_input[categorical_vars].isnull().sum())

print("\nValores faltantes después de la imputación en data_test_input:")
print(data_test_input[categorical_vars].isnull().sum())
Valores faltantes después de la imputación en data_train_input:
NAME_CONTRACT_TYPE            0
CODE_GENDER                   0
FLAG_OWN_CAR                  0
FLAG_OWN_REALTY               0
NAME_TYPE_SUITE               0
NAME_INCOME_TYPE              0
NAME_EDUCATION_TYPE           0
NAME_FAMILY_STATUS            0
NAME_HOUSING_TYPE             0
OCCUPATION_TYPE               0
FONDKAPREMONT_MODE            0
HOUSETYPE_MODE                0
WALLSMATERIAL_MODE            0
EMERGENCYSTATE_MODE           0
ORGANIZATION_TYPE             0
FLAG_DOCUMENT_2               0
FLAG_DOCUMENT_3               0
FLAG_DOCUMENT_4               0
FLAG_DOCUMENT_5               0
FLAG_DOCUMENT_6               0
FLAG_DOCUMENT_7               0
FLAG_DOCUMENT_8               0
FLAG_DOCUMENT_9               0
FLAG_DOCUMENT_10              0
FLAG_DOCUMENT_11              0
FLAG_DOCUMENT_12              0
FLAG_DOCUMENT_13              0
FLAG_DOCUMENT_14              0
FLAG_DOCUMENT_15              0
FLAG_DOCUMENT_16              0
FLAG_DOCUMENT_17              0
FLAG_DOCUMENT_18              0
FLAG_DOCUMENT_19              0
FLAG_DOCUMENT_20              0
FLAG_DOCUMENT_21              0
AMT_REQ_CREDIT_BUREAU_HOUR    0
AMT_REQ_CREDIT_BUREAU_DAY     0
AMT_REQ_CREDIT_BUREAU_WEEK    0
AMT_REQ_CREDIT_BUREAU_QRT     0
dtype: int64

Valores faltantes después de la imputación en data_test_input:
NAME_CONTRACT_TYPE            0
CODE_GENDER                   0
FLAG_OWN_CAR                  0
FLAG_OWN_REALTY               0
NAME_TYPE_SUITE               0
NAME_INCOME_TYPE              0
NAME_EDUCATION_TYPE           0
NAME_FAMILY_STATUS            0
NAME_HOUSING_TYPE             0
OCCUPATION_TYPE               0
FONDKAPREMONT_MODE            0
HOUSETYPE_MODE                0
WALLSMATERIAL_MODE            0
EMERGENCYSTATE_MODE           0
ORGANIZATION_TYPE             0
FLAG_DOCUMENT_2               0
FLAG_DOCUMENT_3               0
FLAG_DOCUMENT_4               0
FLAG_DOCUMENT_5               0
FLAG_DOCUMENT_6               0
FLAG_DOCUMENT_7               0
FLAG_DOCUMENT_8               0
FLAG_DOCUMENT_9               0
FLAG_DOCUMENT_10              0
FLAG_DOCUMENT_11              0
FLAG_DOCUMENT_12              0
FLAG_DOCUMENT_13              0
FLAG_DOCUMENT_14              0
FLAG_DOCUMENT_15              0
FLAG_DOCUMENT_16              0
FLAG_DOCUMENT_17              0
FLAG_DOCUMENT_18              0
FLAG_DOCUMENT_19              0
FLAG_DOCUMENT_20              0
FLAG_DOCUMENT_21              0
AMT_REQ_CREDIT_BUREAU_HOUR    0
AMT_REQ_CREDIT_BUREAU_DAY     0
AMT_REQ_CREDIT_BUREAU_WEEK    0
AMT_REQ_CREDIT_BUREAU_QRT     0
dtype: int64

Guardado de la tabla¶

In [89]:
# Rutas para guardar los archivos
train_output_path = "../data/processed_data/processed_data_input/data_train_preprocessing_missing_outlier.csv"
test_output_path = "../data/processed_data/processed_data_input/data_test_preprocessing_missing_outlier.csv"

# Crear las carpetas (si no existen todavia)
os.makedirs(os.path.dirname(train_output_path), exist_ok=True)
os.makedirs(os.path.dirname(test_output_path), exist_ok=True)

# Guardar los DataFrames como .CSV
data_train_input.to_csv(train_output_path, index=False)
data_test_input.to_csv(test_output_path, index=False)

print(f"Archivo de entrenamiento guardado en: {train_output_path}")
print(f"Archivo de prueba guardado en: {test_output_path}")
Archivo de entrenamiento guardado en: ../data/processed_data/processed_data_input/data_train_preprocessing_missing_outlier.csv
Archivo de prueba guardado en: ../data/processed_data/processed_data_input/data_test_preprocessing_missing_outlier.csv
In [90]:
print(data_train_input.shape, data_test_input.shape)
(246008, 118) (61503, 118)